[clang] [llvm] [AMDGPU] Infer amdgpu-no-flat-scratch-init attribute in AMDGPUAttributor (PR #94647)

Jun Wang via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 31 17:07:43 PDT 2024


https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/94647

>From 3030189e247592c9d893af0aa8681656b5269226 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 6 Jun 2024 10:45:12 -0700
Subject: [PATCH 01/13] [AMDGPU] Infer amdgpu-no-flat-scratch-init attribute in
 AMDGPUAttributor

The AMDGPUAnnotateKernelFeatures pass infers the "amdgpu-calls" and
"amdgpu-stack-objects" attributes, which are used to infer whether we need to
initialize flat scratch. This is, however, not precise. Instead, we should use
AMDGPUAttributor and infer amdgpu-no-flat-scratch-init on kernels.
Refer to https://github.com/llvm/llvm-project/issues/63586 .
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributes.def   |    1 +
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |   43 +
 ...licit-kernarg-backend-usage-global-isel.ll |   30 +-
 .../AMDGPU/addrspacecast-constantexpr.ll      |    6 +-
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       |   21 +-
 .../annotate-kernel-features-hsa-call.ll      |  112 +-
 .../AMDGPU/annotate-kernel-features-hsa.ll    |   44 +-
 .../AMDGPU/annotate-kernel-features.ll        |   18 +-
 .../attributor-flatscratchinit-globalisel.ll  | 1028 ++++++++++++
 .../AMDGPU/attributor-flatscratchinit.ll      |  914 +++++++++++
 llvm/test/CodeGen/AMDGPU/attributor-noopt.ll  |    2 +-
 .../AMDGPU/call-graph-register-usage.ll       |   12 +-
 .../callee-special-input-sgprs-fixed-abi.ll   |   36 +-
 .../CodeGen/AMDGPU/direct-indirect-call.ll    |    4 +-
 .../AMDGPU/duplicate-attribute-indirect.ll    |    4 +-
 .../test/CodeGen/AMDGPU/flat-address-space.ll |    8 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   31 +-
 .../AMDGPU/implicitarg-offset-attributes.ll   |   30 +-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |    2 +-
 llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll  |  248 +--
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |   29 +-
 .../AMDGPU/lower-module-lds-via-hybrid.ll     |   15 +-
 .../AMDGPU/lower-module-lds-via-table.ll      |   15 +-
 .../AMDGPU/memory-legalizer-flat-agent.ll     | 1380 +++++++++++++++++
 .../memory-legalizer-flat-nontemporal.ll      |   75 +
 .../memory-legalizer-flat-singlethread.ll     | 1380 +++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-system.ll    | 1380 +++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-volatile.ll  |   66 +
 .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ++++++++++++++++
 .../AMDGPU/memory-legalizer-global-agent.ll   |  273 ++++
 .../memory-legalizer-global-nontemporal.ll    |   15 +
 .../memory-legalizer-global-singlethread.ll   |  276 ++++
 .../AMDGPU/memory-legalizer-global-system.ll  |  261 ++++
 .../memory-legalizer-global-volatile.ll       |   18 +
 .../memory-legalizer-global-wavefront.ll      |  276 ++++
 .../memory-legalizer-global-workgroup.ll      |  276 ++++
 .../memory-legalizer-local-nontemporal.ll     |    9 +
 .../AMDGPU/memory-legalizer-local-volatile.ll |    6 +
 .../memory-legalizer-private-nontemporal.ll   |   59 +-
 .../memory-legalizer-private-volatile.ll      |   30 +-
 .../AMDGPU/propagate-flat-work-group-size.ll  |   18 +-
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  |   44 +-
 .../AMDGPU/recursive_global_initializer.ll    |    2 +-
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |    8 +-
 .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll     |    2 +-
 .../CodeGen/AMDGPU/simple-indirect-call.ll    |    4 +-
 .../uniform-work-group-attribute-missing.ll   |    4 +-
 .../AMDGPU/uniform-work-group-multistep.ll    |    4 +-
 ...niform-work-group-nested-function-calls.ll |    4 +-
 ...ork-group-prevent-attribute-propagation.ll |    6 +-
 .../uniform-work-group-propagate-attribute.ll |    4 +-
 .../uniform-work-group-recursion-test.ll      |    6 +-
 .../CodeGen/AMDGPU/uniform-work-group-test.ll |    4 +-
 .../AMDGPU/vgpr-spill-placement-issue61083.ll |    2 +-
 55 files changed, 10760 insertions(+), 470 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
index bacc8e4e821e5d..8c1c8219690ba5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
@@ -30,5 +30,6 @@ AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")
 AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id")
 AMDGPU_ATTRIBUTE(DEFAULT_QUEUE, "amdgpu-no-default-queue")
 AMDGPU_ATTRIBUTE(COMPLETION_ACTION, "amdgpu-no-completion-action")
+AMDGPU_ATTRIBUTE(FLAT_SCRATCH_INIT, "amdgpu-no-flat-scratch-init")
 
 #undef AMDGPU_ATTRIBUTE
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 687a7339da379d..76eba1aa9ffcd9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -439,6 +439,19 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       indicatePessimisticFixpoint();
       return;
     }
+
+    bool HasAllocaOrASCast = false;
+    for (BasicBlock &BB : *F) {
+      for (Instruction &I : BB) {
+        if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
+          HasAllocaOrASCast = true;
+          removeAssumedBits(FLAT_SCRATCH_INIT);
+          break;
+        }
+      }
+      if (HasAllocaOrASCast)
+        break;
+    }
   }
 
   ChangeStatus updateImpl(Attributor &A) override {
@@ -525,6 +538,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
       removeAssumedBits(COMPLETION_ACTION);
 
+    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
+      removeAssumedBits(FLAT_SCRATCH_INIT);
+
     return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                        : ChangeStatus::UNCHANGED;
   }
@@ -683,6 +699,33 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                               UsedAssumedInformation);
   }
+
+  // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
+  // not to be set.
+  bool needFlatScratchInit(Attributor &A) {
+    // This is called on each callee; false means callee shouldn't have
+    // no-flat-scratch-init.
+    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
+      const auto &CB = cast<CallBase>(I);
+      const Value *CalleeOp = CB.getCalledOperand();
+      const Function *Callee = dyn_cast<Function>(CalleeOp);
+      if (!Callee) // indirect call
+        return CB.isInlineAsm();
+
+      if (Callee->isIntrinsic())
+        return true;
+
+      const auto *CalleeInfo = A.getAAFor<AAAMDAttributes>(
+          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+      return CalleeInfo && CalleeInfo->isAssumed(FLAT_SCRATCH_INIT);
+    };
+
+    bool UsedAssumedInformation = false;
+    // If any callee is false (i.e. need FlatScratchInit),
+    // checkForAllCallLikeInstructions returns false
+    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
+                                              UsedAssumedInformation);
+  }
 };
 
 AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 9443b39dcdc033..80e9ae33d6d45e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -10,9 +10,11 @@
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
+; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_mov_b32 s4, s0
 ; GFX8V4-NEXT:    s_mov_b32 s5, s3
@@ -23,6 +25,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], s[6:7], 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
@@ -35,9 +38,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX8V5-LABEL: addrspacecast:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_mov_b32 s4, s0
 ; GFX8V5-NEXT:    s_mov_b32 s5, s2
@@ -47,6 +52,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], s[2:3], 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
@@ -59,10 +65,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_mov_b32 s2, s0
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
@@ -71,6 +78,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V4-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
@@ -83,10 +91,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_mov_b32 s2, s0
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
@@ -95,6 +104,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V5-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index cff9ce05066793..96bbcb7ed2149a 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -233,9 +233,9 @@ attributes #1 = { nounwind }
 ; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
 ;.
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index e5d440b96349f6..05ee8cabb5e7c4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -116,7 +116,7 @@ define amdgpu_kernel void @kernel_calls_extern() {
 define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
 ; CHECK-SAME: ) #[[ATTR4]] {
-; CHECK-NEXT:    call void @unknown() #[[ATTR9:[0-9]+]]
+; CHECK-NEXT:    call void @unknown() #[[ATTR10:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   call void @unknown() #0
@@ -136,7 +136,7 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
 define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
 ; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
-; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR9]]
+; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR10]]
 ; CHECK-NEXT:    ret void
 ;
   call void %indirect() #0
@@ -229,7 +229,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
 
 define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
-; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR7:[0-9]+]] {
 ; CHECK-NEXT:    [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
@@ -254,14 +254,15 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 
 attributes #0 = { "amdgpu-no-agpr" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-no-agpr" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 3d4ae84d9c698e..23294eacbe6cb3 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -530,7 +530,7 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -545,7 +545,7 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_implicitarg_ptr()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -645,7 +645,7 @@ define internal void @defined.func() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -658,7 +658,7 @@ define void @func_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -673,7 +673,7 @@ define void @func_call_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -687,8 +687,8 @@ define void @func_call_asm() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
-; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR28:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
+; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR31:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void asm sideeffect "", ""() #3
@@ -702,7 +702,7 @@ define amdgpu_kernel void @kern_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -717,7 +717,7 @@ define amdgpu_kernel void @func_kern_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -769,7 +769,7 @@ define float @func_indirect_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]]()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -788,7 +788,7 @@ define float @func_extern_call() #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @extern()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -806,7 +806,7 @@ define float @func_null_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float null()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -827,7 +827,7 @@ define float @func_other_intrinsic_call(float %arg) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
-; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -845,7 +845,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -861,7 +861,7 @@ define void @func_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -877,7 +877,7 @@ define void @func_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -893,7 +893,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -912,7 +912,7 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @extern_func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -928,7 +928,7 @@ define internal void @enqueue_block_def() #6 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR28:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -941,7 +941,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR29:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_decl()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -956,7 +956,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_def()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -969,7 +969,7 @@ define void @unused_enqueue_block() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -980,7 +980,7 @@ define internal void @known_func() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -994,8 +994,8 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
-; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR29:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
+; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR32:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void @known_func() #6
@@ -1024,36 +1024,40 @@ attributes #6 = { "enqueued-block" }
 ; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" }
 ; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" }
 ;.
+
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind }
-; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR26:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR30]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR31]] = { nounwind }
+; ATTRIBUTOR_HSA: attributes #[[ATTR32]] = { "enqueued-block" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 43cdf85ed3818c..e30e013e3e3d8f 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -454,7 +454,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 
 define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_group_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(3) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -466,7 +466,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_private_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -503,7 +503,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 
 define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_global_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -515,7 +515,7 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_constant_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4)
 ; HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -534,7 +534,7 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -555,7 +555,7 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -575,7 +575,7 @@ define amdgpu_kernel void @use_alloca() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -596,7 +596,7 @@ define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
 ; ATTRIBUTOR_HSA-NEXT:  entry:
 ; ATTRIBUTOR_HSA-NEXT:    br label [[BB:%.*]]
 ; ATTRIBUTOR_HSA:       bb:
@@ -621,7 +621,7 @@ define void @use_alloca_func() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR13:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -643,19 +643,21 @@ attributes #1 = { nounwind }
 ; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index 547ff69592ca0e..89fe46d975309a 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -398,13 +398,13 @@ attributes #1 = { nounwind }
 ; AKF_CHECK: attributes #[[ATTR1]] = { nounwind }
 ;.
 ; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
new file mode 100644
index 00000000000000..ce5a3eedb5ebb1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -0,0 +1,1028 @@
+; Test the generation of the attribute amdgpu-no-flat-scratch-init
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator < %s | FileCheck -check-prefixes=GFX10 %s
+
+;; tests of alloca
+
+define void @without_alloca(i1 %arg0) {
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
+
+define amdgpu_vs void @with_alloca_cc_vs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gs void @with_alloca_cc_gs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ps void @with_alloca_cc_ps() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs void @with_alloca_cc_cs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_hs void @with_alloca_cc_hs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ls void @with_alloca_cc_ls() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_es void @with_alloca_cc_es() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gfx void @with_alloca_cc_gfx() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define void @call_without_alloca() {
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_cc_kernel() {
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define void @call_with_alloca() {
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_alloca_cc_kernel() {
+  call void @with_alloca()
+  ret void
+}
+
+define void @call_both_with_and_without_alloca() {
+  call void @with_alloca()
+  call void @without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
+  call void @with_alloca()
+  call void @without_alloca()
+  ret void
+}
+
+define void @call_call_without_alloca() {
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @call_call_with_alloca() {
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
+  call void @call_with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_without_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_without_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+;; tests of addrspacecast
+
+define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of mixed alloca and addrspacecast
+
+define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of indirect call, intrinsics
+
+ at gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define void @with_indirect_call() {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define amdgpu_kernel void @with_indirect_call_cc_kernel() {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define void @call_with_indirect_call() {
+  call void @with_indirect_call()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
+  call void @with_indirect_call()
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+
+define void @use_intrinsic_workitem_id_x() {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define void @call_use_intrinsic_workitem_id_x() {
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+; GFX10: name:            without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_cc_vs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_gs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
+;
+; GFX10: name:            with_alloca_cc_ps
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_cs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_hs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
+;
+; GFX10: name:            with_alloca_cc_ls
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_es
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_gfx
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            with_alloca_cc_cs_chain
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            with_alloca_cc_cs_chain_preserve
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_both_with_and_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_both_with_and_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            without_global_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_global_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_global_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_global_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_region_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_region_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_region_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_region_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_group_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_group_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_group_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_group_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_constant_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_constant_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_constant_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_constant_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_both_with_and_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_call_both_with_and_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_cast_call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_cast_call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_cast_call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_cast_call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_without_alloca_and_without_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_and_without_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_without_alloca_and_with_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_and_with_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_indirect_call
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_indirect_call_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr8_sgpr9' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_indirect_call
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_indirect_call_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr8_sgpr9' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            use_intrinsic_workitem_id_x
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            use_intrinsic_workitem_id_x_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_use_intrinsic_workitem_id_x
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_use_intrinsic_workitem_id_x_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
new file mode 100644
index 00000000000000..c0d700cc37464b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -0,0 +1,914 @@
+; Test the generation of the attribute amdgpu-no-flat-scratch-init
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
+
+;; tests of alloca
+
+define void @without_alloca(i1 %arg0) {
+; GFX9-LABEL: define void @without_alloca(i1 %arg0)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_alloca(i1 %arg0)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @with_alloca() {
+; GFX9-LABEL: define void @with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
+; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
+
+define amdgpu_vs void @with_alloca_cc_vs() {
+; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gs void @with_alloca_cc_gs() {
+; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ps void @with_alloca_cc_ps() {
+; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs void @with_alloca_cc_cs() {
+; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_hs void @with_alloca_cc_hs() {
+; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ls void @with_alloca_cc_ls() {
+; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_es void @with_alloca_cc_es() {
+; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gfx void @with_alloca_cc_gfx() {
+; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
+; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
+; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define void @call_without_alloca() {
+; GFX9-LABEL: define void @call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define void @call_with_alloca() {
+; GFX9-LABEL: define void @call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_alloca()
+  ret void
+}
+
+define void @call_both_with_and_without_alloca() {
+; GFX9-LABEL: define void @call_both_with_and_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_both_with_and_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_alloca()
+  call void @without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_alloca()
+  call void @without_alloca()
+  ret void
+}
+
+define void @call_call_without_alloca() {
+; GFX9-LABEL: define void @call_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @call_call_with_alloca() {
+; GFX9-LABEL: define void @call_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_without_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_with_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_without_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_with_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+;; tests of addrspacecast
+
+define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of mixed alloca and addrspacecast
+
+define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of indirect call, intrinsics
+
+ at gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define void @with_indirect_call() {
+; GFX9-LABEL: define void @with_indirect_call()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_indirect_call()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]] {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define amdgpu_kernel void @with_indirect_call_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define void @call_with_indirect_call() {
+; GFX9-LABEL: define void @call_with_indirect_call()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_indirect_call()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]]
+  call void @with_indirect_call()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+  call void @with_indirect_call()
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+
+define void @use_intrinsic_workitem_id_x() {
+; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+;
+; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define void @call_use_intrinsic_workitem_id_x() {
+; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+;GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]]  = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 90562e25a3e9c1..470c444ad8cd74 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -23,7 +23,7 @@
 ; COV5: .amdhsa_user_sgpr_queue_ptr 0
 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
-; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
+; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1
 ; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
 ; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index dbd00f09943c01..0d66ea55a04378 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -33,9 +33,9 @@ define void @indirect_use_vcc() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
-; CI: ; TotalNumSgprs: 38
-; VI-NOBUG: ; TotalNumSgprs: 40
-; VI-BUG: ; TotalNumSgprs: 96
+; CI: ; NumSgprs: 36
+; VI-NOBUG: ; NumSgprs: 36
+; VI-BUG: ; NumSgprs: 96
 ; GCN: ; NumVgprs: 41
 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
   call void @indirect_use_vcc()
@@ -121,9 +121,9 @@ define void @indirect_use_80_sgpr() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
-; CI: ; TotalNumSgprs: 84
-; VI-NOBUG: ; TotalNumSgprs: 86
-; VI-BUG: ; TotalNumSgprs: 96
+; CI: ; NumSgprs: 82
+; VI-NOBUG: ; NumSgprs: 82
+; VI-BUG: ; NumSgprs: 96
 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
   call void @indirect_use_80_sgpr()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 8ef2d89e76d4e1..25a5207d3bbb7c 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -198,11 +198,11 @@ define hidden void @use_workgroup_id_yz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
 ; GCN-NOT: s6
-; GCN: s_mov_b32 s12, s6
+; GCN: s_mov_b32 s12, s4
 ; GCN: s_mov_b32 s32, 0
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x at rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x at rel32@hi+12
+; GCN: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x at rel32@hi+12
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 
@@ -216,7 +216,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
 ; GCN-NOT: s12
-; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s13, s5
 ; GCN-NOT: s12
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -232,7 +232,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
 ; GCN-NOT: s12
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s14, s5
 ; GCN-NOT: s12
 ; GCN-NOT: s13
 
@@ -249,8 +249,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
 ; GCN-NOT: s14
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s13, s5
 ; GCN-NOT: s14
 
 ; GCN: s_mov_b32 s32, 0
@@ -265,9 +265,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
-; GCN: s_mov_b32 s12, s6
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s12, s4
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 
@@ -282,8 +282,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
 
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s14, s5
 ; GCN-NOT: s13
 
 ; GCN: s_mov_b32 s32, 0
@@ -299,8 +299,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
 
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -381,7 +381,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
 
 ; GCN-NOT: s13
 ; GCN-NOT: s14
-; GCN-DAG: s_mov_b32 s12, s6
+; GCN-DAG: s_mov_b32 s12, s4
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
 ; GCN-NOT: s13
 ; GCN-NOT: s14
@@ -399,7 +399,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s13, s7
+; GCN-DAG: s_mov_b32 s13, s5
 
 ; GCN-DAG: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -414,7 +414,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s14, s7
+; GCN-DAG: s_mov_b32 s14, s5
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index aa182b720c6042..ebca9906998787 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 848019c8729251..074dba1cbcc931 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
 ;.
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
 ;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 0ad53083d0ff3f..eb3d356ea59b44 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @store_flat_i32(ptr addrspace(1) %gptr, i32 %x) #0 {
 }
 
 ; GCN-LABEL: {{^}}store_flat_i64:
-; GCN: flat_store_{{dwordx2|b64}}
+; GCN: flat_store_{{dword|b64}}
 define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   store volatile i64 %x, ptr %fptr, align 8
@@ -28,7 +28,7 @@ define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
 }
 
 ; GCN-LABEL: {{^}}store_flat_v4i32:
-; GCN: flat_store_{{dwordx4|b128}}
+; GCN: flat_store_{{dword|b128}}
 define amdgpu_kernel void @store_flat_v4i32(ptr addrspace(1) %gptr, <4 x i32> %x) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   store volatile <4 x i32> %x, ptr %fptr, align 16
@@ -65,7 +65,7 @@ define amdgpu_kernel void @load_flat_i32(ptr addrspace(1) noalias %out, ptr addr
 }
 
 ; GCN-LABEL: load_flat_i64:
-; GCN: flat_load_{{dwordx2|b64}}
+; GCN: flat_load_{{dword|b64}}
 define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   %fload = load volatile i64, ptr %fptr, align 8
@@ -74,7 +74,7 @@ define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addr
 }
 
 ; GCN-LABEL: load_flat_v4i32:
-; GCN: flat_load_{{dwordx4|b128}}
+; GCN: flat_load_{{dword|b128}}
 define amdgpu_kernel void @load_flat_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   %fload = load volatile <4 x i32>, ptr %fptr, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index b89dbd42e0466f..c998a4b19121e1 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -10,9 +10,11 @@
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
+; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V4-NEXT:    s_cselect_b32 s3, s3, 0
@@ -22,6 +24,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8V4-NEXT:    s_cselect_b32 s0, s2, 0
 ; GFX8V4-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v4
@@ -33,9 +36,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX8V5-LABEL: addrspacecast:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
-; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V5-NEXT:    s_cselect_b32 s2, s2, 0
@@ -45,6 +50,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8V5-NEXT:    s_cselect_b32 s0, s3, 0
 ; GFX8V5-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v4
@@ -56,10 +62,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V4-NEXT:    s_cselect_b32 s2, s3, 0
@@ -69,6 +76,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V4-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V4-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v4
@@ -80,10 +88,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V5-NEXT:    s_cselect_b32 s2, s3, 0
@@ -93,6 +102,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V5-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V5-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v4
@@ -101,6 +111,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    flat_store_dword v[2:3], v0
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
+
   %flat.private = addrspacecast ptr addrspace(5) %ptr.private to ptr
   %flat.local = addrspacecast ptr addrspace(3) %ptr.local to ptr
   store volatile i32 1, ptr %flat.private
diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
index 4c21f872974559..5979f1bbacdd14 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
@@ -258,25 +258,25 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
 
 ;.
 ; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 957f404c8cdbed..a2f55d6aa83968 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
 ; GCN-NOT: writelane
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; TotalNumSgprs: 37
+; GCN: ; NumSgprs: 33
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
   %vgpr = load volatile i32, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index d51ace630f6925..aebc8315514fbc 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -113,24 +113,16 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
-; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -176,24 +168,16 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
-; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -239,24 +223,16 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
-; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -302,24 +278,16 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
-; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -352,12 +320,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -371,8 +334,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 0
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v4, v3
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -385,24 +348,15 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    s_mov_b32 s15, 4
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -429,12 +383,7 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -448,8 +397,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 2
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v4, v3
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -462,24 +411,15 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    s_mov_b32 s15, 6
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -506,12 +446,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -525,8 +460,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v4, v3
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -539,24 +474,15 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    s_mov_b32 s15, 5
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -583,12 +509,7 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -602,8 +523,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 3
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v4, v3
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -616,24 +537,15 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    s_mov_b32 s15, 7
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index c201f84cac7268..8fadfe3d026669 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -42,28 +42,19 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 ; GCN-LABEL: indirect_lds_id:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; GCN-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s8, s6, 8
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT:    s_addc_u32 s9, s7, 0
-; GCN-NEXT:    s_getpc_b64 s[6:7]
-; GCN-NEXT:    s_add_u32 s6, s6, function_lds_id at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s7, s7, function_lds_id at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_getpc_b64 s[8:9]
+; GCN-NEXT:    s_add_u32 s8, s8, function_lds_id at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s9, s9, function_lds_id at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
 ; GCN-NEXT:    s_mov_b32 s15, 21
+; GCN-NEXT:    s_mov_b32 s12, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NEXT:    v_mov_b32_e32 v1, s17
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    s_endpgm
   call void @function_lds_id(ptr addrspace(1) %out)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 9bbcc6988e311f..05ad567478675c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -161,10 +161,7 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -200,10 +197,7 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -250,10 +244,7 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 72a0aceaae12b6..3453ff9d296c09 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -226,10 +226,7 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -273,10 +270,7 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -323,10 +317,7 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 1c33d8a19890db..7868fa9a7ce4c9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_agent_unordered_load(
 ; GFX7-LABEL: flat_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -202,6 +217,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_load(
 ; GFX7-LABEL: flat_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -389,6 +419,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_load(
 ; GFX7-LABEL: flat_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -405,6 +438,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -423,6 +460,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -456,6 +497,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -471,6 +514,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -591,6 +636,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_load(
 ; GFX7-LABEL: flat_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -608,6 +656,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -628,6 +680,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -664,6 +720,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -680,6 +738,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -819,6 +879,9 @@ entry:
 define amdgpu_kernel void @flat_agent_unordered_store(
 ; GFX7-LABEL: flat_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -830,6 +893,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -842,6 +909,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -865,6 +936,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -876,6 +949,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -959,6 +1034,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_store(
 ; GFX7-LABEL: flat_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -970,6 +1048,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -982,6 +1064,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1005,6 +1091,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1016,6 +1104,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1099,6 +1189,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_store(
 ; GFX7-LABEL: flat_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1111,6 +1204,10 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1125,6 +1222,10 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1151,6 +1252,8 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1163,6 +1266,8 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1265,6 +1370,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_store(
 ; GFX7-LABEL: flat_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1277,6 +1385,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1291,6 +1403,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1317,6 +1433,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1329,6 +1447,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1431,6 +1551,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1443,6 +1566,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1456,6 +1583,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1481,6 +1612,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1493,6 +1626,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1585,6 +1720,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1599,6 +1737,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1616,6 +1758,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1646,6 +1792,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1660,6 +1808,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1770,6 +1920,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1783,6 +1936,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1798,6 +1955,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1826,6 +1987,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1839,6 +2002,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1950,6 +2115,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1965,6 +2133,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1984,6 +2156,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2017,6 +2193,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2032,6 +2210,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2161,6 +2341,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2176,6 +2359,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2195,6 +2382,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2228,6 +2419,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2243,6 +2436,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2372,6 +2567,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2388,6 +2586,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2406,6 +2608,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2439,6 +2645,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2454,6 +2662,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2572,6 +2782,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2589,6 +2802,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2609,6 +2826,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2645,6 +2866,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2661,6 +2884,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2802,6 +3027,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2819,6 +3047,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2839,6 +3071,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2875,6 +3111,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2891,6 +3129,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3032,6 +3272,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3057,6 +3300,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3082,6 +3329,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3132,6 +3383,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3147,6 +3400,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3265,6 +3520,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3292,6 +3550,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3321,6 +3583,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3376,6 +3642,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3393,6 +3661,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3529,6 +3799,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3555,6 +3828,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3582,6 +3859,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3635,6 +3916,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3651,6 +3934,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3788,6 +4073,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3816,6 +4104,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3847,6 +4139,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3905,6 +4201,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3923,6 +4221,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4078,6 +4378,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4106,6 +4409,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4137,6 +4444,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4195,6 +4506,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4213,6 +4526,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4368,6 +4683,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4395,6 +4713,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4424,6 +4746,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4479,6 +4805,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4496,6 +4824,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4632,6 +4962,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4659,6 +4992,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4688,6 +5025,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4743,6 +5084,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4760,6 +5103,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4896,6 +5241,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4924,6 +5272,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4955,6 +5307,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5013,6 +5369,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5031,6 +5389,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5186,6 +5546,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5214,6 +5577,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5245,6 +5612,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5303,6 +5674,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5321,6 +5694,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5476,6 +5851,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5504,6 +5882,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5535,6 +5917,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5593,6 +5979,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5611,6 +5999,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5766,6 +6156,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5794,6 +6187,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5825,6 +6222,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5883,6 +6284,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5901,6 +6304,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6056,6 +6461,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6084,6 +6492,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6115,6 +6527,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6173,6 +6589,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6191,6 +6609,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6346,6 +6766,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6374,6 +6797,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6405,6 +6832,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6463,6 +6894,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6481,6 +6914,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6636,6 +7071,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6664,6 +7102,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6695,6 +7137,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6753,6 +7199,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6771,6 +7219,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6926,6 +7376,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6954,6 +7407,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6985,6 +7442,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7043,6 +7504,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7061,6 +7524,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7216,6 +7681,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7244,6 +7712,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7272,6 +7744,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7328,6 +7804,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7346,6 +7824,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7491,6 +7971,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7520,6 +8003,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7550,6 +8037,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7608,6 +8099,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7627,6 +8120,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7781,6 +8276,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7810,6 +8308,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7840,6 +8342,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7899,6 +8405,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7918,6 +8426,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8082,6 +8592,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8112,6 +8625,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8144,6 +8661,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8205,6 +8726,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8225,6 +8748,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8402,6 +8927,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8432,6 +8960,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8464,6 +8996,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8525,6 +9061,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8545,6 +9083,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8722,6 +9262,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8751,6 +9294,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8781,6 +9328,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8839,6 +9390,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8858,6 +9411,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9016,6 +9571,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9045,6 +9603,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9075,6 +9637,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9133,6 +9699,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9152,6 +9720,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9306,6 +9876,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9336,6 +9909,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9368,6 +9945,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9429,6 +10010,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9449,6 +10032,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9626,6 +10211,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9656,6 +10244,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9688,6 +10280,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9749,6 +10345,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9769,6 +10367,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9946,6 +10546,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9976,6 +10579,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10008,6 +10615,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10069,6 +10680,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10089,6 +10702,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10266,6 +10881,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10296,6 +10914,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10328,6 +10950,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10389,6 +11015,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10409,6 +11037,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10586,6 +11216,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10616,6 +11249,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10648,6 +11285,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10709,6 +11350,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10729,6 +11372,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10902,6 +11547,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10932,6 +11580,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10964,6 +11616,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11025,6 +11681,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11045,6 +11703,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11222,6 +11882,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11252,6 +11915,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11284,6 +11951,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11345,6 +12016,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11365,6 +12038,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11542,6 +12217,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11572,6 +12250,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11604,6 +12286,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11665,6 +12351,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11685,6 +12373,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11862,6 +12552,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ; GFX7-LABEL: flat_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11877,6 +12570,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11893,6 +12590,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11924,6 +12625,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11938,6 +12641,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12049,6 +12754,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12064,6 +12772,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12080,6 +12792,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12111,6 +12827,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12125,6 +12843,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12236,6 +12956,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ; GFX7-LABEL: flat_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12253,6 +12976,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12272,6 +12999,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12307,6 +13038,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12323,6 +13056,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12448,6 +13183,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12466,6 +13204,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12487,6 +13229,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12525,6 +13271,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12542,6 +13290,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12686,6 +13436,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ; GFX7-LABEL: flat_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12697,6 +13450,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12709,6 +13466,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12732,6 +13493,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12743,6 +13506,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12826,6 +13591,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12837,6 +13605,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12849,6 +13621,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12872,6 +13648,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12883,6 +13661,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12966,6 +13746,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_store(
 ; GFX7-LABEL: flat_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12978,6 +13761,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12992,6 +13779,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13018,6 +13809,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13030,6 +13823,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13132,6 +13927,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13144,6 +13942,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13158,6 +13960,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13184,6 +13990,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13196,6 +14004,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13298,6 +14108,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13310,6 +14123,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13323,6 +14140,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13348,6 +14169,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13360,6 +14183,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13452,6 +14277,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13466,6 +14294,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13482,6 +14314,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13511,6 +14347,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13525,6 +14363,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13633,6 +14473,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13646,6 +14489,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13661,6 +14508,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13689,6 +14540,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13702,6 +14555,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13813,6 +14668,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13828,6 +14686,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13846,6 +14708,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13878,6 +14744,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13893,6 +14761,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14020,6 +14890,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14035,6 +14908,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14053,6 +14930,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14085,6 +14966,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14100,6 +14983,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14227,6 +15112,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14244,6 +15132,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14263,6 +15155,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14298,6 +15194,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14314,6 +15212,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14437,6 +15337,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14455,6 +15358,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14476,6 +15383,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14514,6 +15425,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14531,6 +15444,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14677,6 +15592,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14695,6 +15613,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14716,6 +15638,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14754,6 +15680,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14771,6 +15699,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14917,6 +15847,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14942,6 +15875,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14967,6 +15904,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15017,6 +15958,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15032,6 +15975,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15150,6 +16095,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15177,6 +16125,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15205,6 +16157,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15259,6 +16215,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15276,6 +16234,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15410,6 +16370,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15436,6 +16399,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15463,6 +16430,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15516,6 +16487,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15532,6 +16505,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15669,6 +16644,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15697,6 +16675,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15727,6 +16709,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15784,6 +16770,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15802,6 +16790,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15955,6 +16945,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15983,6 +16976,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16013,6 +17010,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16070,6 +17071,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16088,6 +17091,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16241,6 +17246,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16268,6 +17276,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16296,6 +17308,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16350,6 +17366,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16367,6 +17385,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16501,6 +17521,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16528,6 +17551,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16556,6 +17583,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16610,6 +17641,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16627,6 +17660,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16761,6 +17796,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16789,6 +17827,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16819,6 +17861,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16876,6 +17922,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16894,6 +17942,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17047,6 +18097,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17075,6 +18128,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17105,6 +18162,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17162,6 +18223,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17180,6 +18243,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17333,6 +18398,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17361,6 +18429,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17391,6 +18463,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17448,6 +18524,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17466,6 +18544,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17619,6 +18699,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17647,6 +18730,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17677,6 +18764,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17734,6 +18825,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17752,6 +18845,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17905,6 +19000,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17933,6 +19031,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17963,6 +19065,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18020,6 +19126,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18038,6 +19146,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18191,6 +19301,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18219,6 +19332,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18249,6 +19366,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18306,6 +19427,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18324,6 +19447,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18477,6 +19602,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18505,6 +19633,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18535,6 +19667,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18592,6 +19728,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18610,6 +19748,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18763,6 +19903,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18791,6 +19934,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18821,6 +19968,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18878,6 +20029,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18896,6 +20049,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19049,6 +20204,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19077,6 +20235,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19105,6 +20267,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19161,6 +20327,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19179,6 +20347,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19324,6 +20494,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19354,6 +20527,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19385,6 +20562,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19445,6 +20626,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19465,6 +20648,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19624,6 +20809,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19653,6 +20841,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19683,6 +20875,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19742,6 +20938,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19761,6 +20959,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19925,6 +21125,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19956,6 +21159,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19989,6 +21196,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20052,6 +21263,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20073,6 +21286,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20255,6 +21470,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20286,6 +21504,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20319,6 +21541,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20382,6 +21608,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20403,6 +21631,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20585,6 +21815,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20615,6 +21848,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20646,6 +21883,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20706,6 +21947,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20726,6 +21969,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20889,6 +22134,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20919,6 +22167,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20950,6 +22202,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21010,6 +22266,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21030,6 +22288,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21189,6 +22449,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21220,6 +22483,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21253,6 +22520,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21316,6 +22587,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21337,6 +22610,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21519,6 +22794,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21550,6 +22828,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21583,6 +22865,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21646,6 +22932,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21667,6 +22955,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21849,6 +23139,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21880,6 +23173,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21913,6 +23210,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21976,6 +23277,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21997,6 +23300,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22179,6 +23484,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22210,6 +23518,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22243,6 +23555,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22306,6 +23622,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22327,6 +23645,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22509,6 +23829,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22540,6 +23863,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22573,6 +23900,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22636,6 +23967,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22657,6 +23990,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22835,6 +24170,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22866,6 +24204,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22899,6 +24241,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22962,6 +24308,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22983,6 +24331,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23165,6 +24515,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23196,6 +24549,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23229,6 +24586,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23292,6 +24653,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23313,6 +24676,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23495,6 +24860,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23526,6 +24894,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23559,6 +24931,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23622,6 +24998,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23643,6 +25021,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index b2340caa2933f9..f189562bafe5f7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -202,6 +217,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -230,6 +248,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -257,6 +279,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -312,6 +338,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -341,6 +369,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -551,6 +581,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -566,6 +599,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -582,6 +619,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -613,6 +654,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -627,6 +670,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -738,6 +783,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -766,6 +814,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -794,6 +846,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -850,6 +906,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -880,6 +938,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1091,6 +1151,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ; GFX7-LABEL: flat_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -1107,6 +1170,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1124,6 +1191,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1157,6 +1228,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1172,6 +1245,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 304c80d7bb24d4..089d2a69facc17 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_singlethread_unordered_load(
 ; GFX7-LABEL: flat_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -202,6 +217,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -389,6 +419,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_load(
 ; GFX7-LABEL: flat_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -404,6 +437,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -420,6 +457,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -451,6 +492,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -465,6 +508,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -576,6 +621,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -591,6 +639,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -607,6 +659,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -638,6 +694,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -652,6 +710,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -763,6 +823,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_unordered_store(
 ; GFX7-LABEL: flat_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -774,6 +837,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -786,6 +853,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -809,6 +880,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -820,6 +893,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -903,6 +978,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -914,6 +992,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -926,6 +1008,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -949,6 +1035,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -960,6 +1048,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1043,6 +1133,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_store(
 ; GFX7-LABEL: flat_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1054,6 +1147,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1066,6 +1163,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1089,6 +1190,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1100,6 +1203,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1183,6 +1288,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1194,6 +1302,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1206,6 +1318,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1229,6 +1345,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1240,6 +1358,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1323,6 +1443,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1335,6 +1458,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1348,6 +1475,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1373,6 +1504,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1385,6 +1518,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1477,6 +1612,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1489,6 +1627,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1502,6 +1644,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1527,6 +1673,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1539,6 +1687,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1631,6 +1781,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1643,6 +1796,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1656,6 +1813,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1681,6 +1842,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1693,6 +1856,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1785,6 +1950,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1797,6 +1965,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1810,6 +1982,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1835,6 +2011,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1847,6 +2025,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1939,6 +2119,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1951,6 +2134,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1964,6 +2151,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1989,6 +2180,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2001,6 +2194,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2093,6 +2288,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2108,6 +2306,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2124,6 +2326,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2155,6 +2361,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2169,6 +2377,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2278,6 +2488,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2293,6 +2506,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2309,6 +2526,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2340,6 +2561,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2354,6 +2577,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2463,6 +2688,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2478,6 +2706,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2494,6 +2726,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2525,6 +2761,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2539,6 +2777,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2648,6 +2888,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2673,6 +2916,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2698,6 +2945,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2748,6 +2999,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2763,6 +3016,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2881,6 +3136,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2906,6 +3164,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2931,6 +3193,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2981,6 +3247,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2996,6 +3264,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3114,6 +3384,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3139,6 +3412,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3164,6 +3441,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3214,6 +3495,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3229,6 +3512,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3347,6 +3632,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3372,6 +3660,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3397,6 +3689,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3447,6 +3743,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3462,6 +3760,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3580,6 +3880,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3605,6 +3908,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3630,6 +3937,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3680,6 +3991,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3695,6 +4008,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3813,6 +4128,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3838,6 +4156,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3863,6 +4185,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3913,6 +4239,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3928,6 +4256,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4046,6 +4376,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4071,6 +4404,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4096,6 +4433,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4146,6 +4487,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4161,6 +4504,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4279,6 +4624,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4304,6 +4652,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4329,6 +4681,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4379,6 +4735,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4394,6 +4752,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4512,6 +4872,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4537,6 +4900,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4562,6 +4929,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4612,6 +4983,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4627,6 +5000,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4745,6 +5120,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4770,6 +5148,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4795,6 +5177,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4845,6 +5231,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4860,6 +5248,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4978,6 +5368,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5003,6 +5396,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5028,6 +5425,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5078,6 +5479,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5093,6 +5496,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5211,6 +5616,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5236,6 +5644,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5261,6 +5673,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5311,6 +5727,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5326,6 +5744,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5444,6 +5864,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5469,6 +5892,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5494,6 +5921,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5544,6 +5975,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5559,6 +5992,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5677,6 +6112,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5702,6 +6140,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5727,6 +6169,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5777,6 +6223,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5792,6 +6240,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5910,6 +6360,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5935,6 +6388,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5960,6 +6417,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6010,6 +6471,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6025,6 +6488,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6143,6 +6608,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6171,6 +6639,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6199,6 +6671,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6255,6 +6731,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6273,6 +6751,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6418,6 +6898,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6446,6 +6929,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6474,6 +6961,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6530,6 +7021,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6548,6 +7041,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6693,6 +7188,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6721,6 +7219,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6749,6 +7251,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6805,6 +7311,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6823,6 +7331,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6968,6 +7478,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6996,6 +7509,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7024,6 +7541,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7080,6 +7601,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7098,6 +7621,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7243,6 +7768,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7271,6 +7799,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7299,6 +7831,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7355,6 +7891,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7373,6 +7911,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7518,6 +8058,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7546,6 +8089,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7574,6 +8121,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7630,6 +8181,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7648,6 +8201,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7793,6 +8348,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7821,6 +8379,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7849,6 +8411,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7905,6 +8471,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7923,6 +8491,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8068,6 +8638,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8096,6 +8669,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8124,6 +8701,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8180,6 +8761,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8198,6 +8781,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8343,6 +8928,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8371,6 +8959,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8399,6 +8991,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8455,6 +9051,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8473,6 +9071,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8618,6 +9218,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8646,6 +9249,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8674,6 +9281,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8730,6 +9341,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8748,6 +9361,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8893,6 +9508,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8921,6 +9539,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8949,6 +9571,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9005,6 +9631,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9023,6 +9651,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9168,6 +9798,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9196,6 +9829,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9224,6 +9861,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9280,6 +9921,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9298,6 +9941,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9443,6 +10088,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9471,6 +10119,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9499,6 +10151,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9555,6 +10211,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9573,6 +10231,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9718,6 +10378,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9746,6 +10409,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9774,6 +10441,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9830,6 +10501,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9848,6 +10521,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9993,6 +10668,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10021,6 +10699,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10049,6 +10731,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10105,6 +10791,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10123,6 +10811,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10268,6 +10958,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10283,6 +10976,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10299,6 +10996,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10330,6 +11031,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10344,6 +11047,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10455,6 +11160,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10470,6 +11178,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10486,6 +11198,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10517,6 +11233,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10531,6 +11249,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10642,6 +11362,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10657,6 +11380,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10673,6 +11400,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10704,6 +11435,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10718,6 +11451,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10829,6 +11564,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10844,6 +11582,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10860,6 +11602,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10891,6 +11637,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10905,6 +11653,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11016,6 +11766,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11027,6 +11780,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11039,6 +11796,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11062,6 +11823,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11073,6 +11836,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11156,6 +11921,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11167,6 +11935,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11179,6 +11951,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11202,6 +11978,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11213,6 +11991,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11296,6 +12076,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ; GFX7-LABEL: flat_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11307,6 +12090,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11319,6 +12106,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11342,6 +12133,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11353,6 +12146,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11436,6 +12231,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11447,6 +12245,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11459,6 +12261,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11482,6 +12288,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11493,6 +12301,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11576,6 +12386,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11588,6 +12401,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11601,6 +12418,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11626,6 +12447,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11638,6 +12461,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11730,6 +12555,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11742,6 +12570,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11755,6 +12587,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11780,6 +12616,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11792,6 +12630,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11884,6 +12724,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11896,6 +12739,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11909,6 +12756,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11934,6 +12785,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11946,6 +12799,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12038,6 +12893,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12050,6 +12908,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12063,6 +12925,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12088,6 +12954,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12100,6 +12968,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12192,6 +13062,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12204,6 +13077,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12217,6 +13094,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12242,6 +13123,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12254,6 +13137,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12346,6 +13231,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12361,6 +13249,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12377,6 +13269,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12408,6 +13304,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12422,6 +13320,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12531,6 +13431,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12546,6 +13449,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12562,6 +13469,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12593,6 +13504,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12607,6 +13520,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12716,6 +13631,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12731,6 +13649,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12747,6 +13669,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12778,6 +13704,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12792,6 +13720,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12901,6 +13831,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -12926,6 +13859,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12951,6 +13888,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13001,6 +13942,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13016,6 +13959,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13134,6 +14079,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13159,6 +14107,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13184,6 +14136,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13234,6 +14190,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13249,6 +14207,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13367,6 +14327,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13392,6 +14355,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13417,6 +14384,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13467,6 +14438,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13482,6 +14455,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13600,6 +14575,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13625,6 +14603,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13650,6 +14632,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13700,6 +14686,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13715,6 +14703,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13833,6 +14823,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13858,6 +14851,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13883,6 +14880,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13933,6 +14934,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13948,6 +14951,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14066,6 +15071,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14091,6 +15099,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14116,6 +15128,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14166,6 +15182,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14181,6 +15199,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14299,6 +15319,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14324,6 +15347,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14349,6 +15376,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14399,6 +15430,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14414,6 +15447,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14532,6 +15567,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14557,6 +15595,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14582,6 +15624,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14632,6 +15678,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14647,6 +15695,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14765,6 +15815,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14790,6 +15843,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14815,6 +15872,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14865,6 +15926,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14880,6 +15943,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14998,6 +16063,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15023,6 +16091,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15048,6 +16120,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15098,6 +16174,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15113,6 +16191,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15231,6 +16311,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15256,6 +16339,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15281,6 +16368,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15331,6 +16422,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15346,6 +16439,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15464,6 +16559,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15489,6 +16587,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15514,6 +16616,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15564,6 +16670,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15579,6 +16687,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15697,6 +16807,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15722,6 +16835,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15747,6 +16864,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15797,6 +16918,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15812,6 +16935,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15930,6 +17055,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15955,6 +17083,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15980,6 +17112,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16030,6 +17166,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16045,6 +17183,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16163,6 +17303,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16188,6 +17331,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16213,6 +17360,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16263,6 +17414,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16278,6 +17431,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16396,6 +17551,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16424,6 +17582,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16452,6 +17614,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16508,6 +17674,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16526,6 +17694,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16671,6 +17841,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16699,6 +17872,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16727,6 +17904,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16783,6 +17964,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16801,6 +17984,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16946,6 +18131,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16974,6 +18162,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17002,6 +18194,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17058,6 +18254,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17076,6 +18274,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17221,6 +18421,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17249,6 +18452,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17277,6 +18484,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17333,6 +18544,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17351,6 +18564,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17496,6 +18711,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17524,6 +18742,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17552,6 +18774,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17608,6 +18834,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17626,6 +18854,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17771,6 +19001,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17799,6 +19032,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17827,6 +19064,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17883,6 +19124,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17901,6 +19144,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18046,6 +19291,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18074,6 +19322,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18102,6 +19354,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18158,6 +19414,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18176,6 +19434,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18321,6 +19581,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18349,6 +19612,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18377,6 +19644,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18433,6 +19704,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18451,6 +19724,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18596,6 +19871,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18624,6 +19902,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18652,6 +19934,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18708,6 +19994,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18726,6 +20014,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18871,6 +20161,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18899,6 +20192,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18927,6 +20224,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18983,6 +20284,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19001,6 +20304,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19146,6 +20451,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19174,6 +20482,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19202,6 +20514,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19258,6 +20574,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19276,6 +20594,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19421,6 +20741,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19449,6 +20772,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19477,6 +20804,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19533,6 +20864,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19551,6 +20884,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19696,6 +21031,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19724,6 +21062,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19752,6 +21094,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19808,6 +21154,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19826,6 +21174,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19971,6 +21321,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19999,6 +21352,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20027,6 +21384,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20083,6 +21444,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20101,6 +21464,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20246,6 +21611,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20274,6 +21642,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20302,6 +21674,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20358,6 +21734,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20376,6 +21754,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 038b58deb0cf19..69c03ba6a39796 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_system_unordered_load(
 ; GFX7-LABEL: flat_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -202,6 +217,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_load(
 ; GFX7-LABEL: flat_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -389,6 +419,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_load(
 ; GFX7-LABEL: flat_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -405,6 +438,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -423,6 +460,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -456,6 +497,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -472,6 +515,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -593,6 +638,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_load(
 ; GFX7-LABEL: flat_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -610,6 +658,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -630,6 +682,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -666,6 +722,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -683,6 +741,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -823,6 +883,9 @@ entry:
 define amdgpu_kernel void @flat_system_unordered_store(
 ; GFX7-LABEL: flat_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -834,6 +897,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -846,6 +913,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -869,6 +940,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -880,6 +953,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -963,6 +1038,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_store(
 ; GFX7-LABEL: flat_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -974,6 +1052,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -986,6 +1068,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1009,6 +1095,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1020,6 +1108,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1103,6 +1193,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_store(
 ; GFX7-LABEL: flat_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1115,6 +1208,10 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1129,6 +1226,10 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1155,6 +1256,8 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1168,6 +1271,8 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1271,6 +1376,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_store(
 ; GFX7-LABEL: flat_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1283,6 +1391,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1297,6 +1409,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1323,6 +1439,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1336,6 +1454,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1439,6 +1559,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1451,6 +1574,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1464,6 +1591,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1489,6 +1620,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1501,6 +1634,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1593,6 +1728,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1607,6 +1745,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1624,6 +1766,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1654,6 +1800,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1669,6 +1817,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1780,6 +1930,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_atomicrmw(
 ; GFX7-LABEL: flat_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1793,6 +1946,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1808,6 +1965,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1836,6 +1997,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1850,6 +2013,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1962,6 +2127,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1977,6 +2145,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1996,6 +2168,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2029,6 +2205,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2046,6 +2224,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2177,6 +2357,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2192,6 +2375,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2211,6 +2398,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2244,6 +2435,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2261,6 +2454,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2392,6 +2587,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2408,6 +2606,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2426,6 +2628,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2459,6 +2665,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2475,6 +2683,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2594,6 +2804,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2611,6 +2824,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2631,6 +2848,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2667,6 +2888,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2685,6 +2908,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2828,6 +3053,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2845,6 +3073,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2865,6 +3097,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2901,6 +3137,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2919,6 +3157,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3062,6 +3302,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3087,6 +3330,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3112,6 +3359,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3162,6 +3413,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3177,6 +3430,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3295,6 +3550,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3322,6 +3580,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3351,6 +3613,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3406,6 +3672,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3424,6 +3692,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3561,6 +3831,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3587,6 +3860,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3614,6 +3891,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3667,6 +3948,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3684,6 +3967,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3822,6 +4107,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3850,6 +4138,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3881,6 +4173,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3939,6 +4235,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3959,6 +4257,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4116,6 +4416,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4144,6 +4447,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4175,6 +4482,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4233,6 +4544,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4253,6 +4566,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4410,6 +4725,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4437,6 +4755,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4466,6 +4788,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4521,6 +4847,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4539,6 +4867,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4676,6 +5006,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4703,6 +5036,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4732,6 +5069,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4787,6 +5128,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4805,6 +5148,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4942,6 +5287,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4970,6 +5318,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5001,6 +5353,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5059,6 +5415,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5079,6 +5437,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5236,6 +5596,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5264,6 +5627,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5295,6 +5662,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5353,6 +5724,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5373,6 +5746,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5530,6 +5905,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5558,6 +5936,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5589,6 +5971,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5647,6 +6033,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5667,6 +6055,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5824,6 +6214,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5852,6 +6245,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5883,6 +6280,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5941,6 +6342,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5961,6 +6364,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6118,6 +6523,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6146,6 +6554,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6177,6 +6589,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6235,6 +6651,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6255,6 +6673,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6412,6 +6832,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6440,6 +6863,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6471,6 +6898,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6529,6 +6960,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6549,6 +6982,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6706,6 +7141,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6734,6 +7172,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6765,6 +7207,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6823,6 +7269,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6843,6 +7291,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7000,6 +7450,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7028,6 +7481,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7059,6 +7516,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7117,6 +7578,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7137,6 +7600,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7294,6 +7759,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7322,6 +7790,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7350,6 +7822,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7406,6 +7882,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7424,6 +7902,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7569,6 +8049,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7598,6 +8081,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7628,6 +8115,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7686,6 +8177,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7706,6 +8199,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7861,6 +8356,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7890,6 +8388,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7920,6 +8422,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7979,6 +8485,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7999,6 +8507,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8164,6 +8674,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8194,6 +8707,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8226,6 +8743,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8287,6 +8808,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8309,6 +8832,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8488,6 +9013,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8518,6 +9046,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8550,6 +9082,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8611,6 +9147,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8633,6 +9171,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8812,6 +9352,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8841,6 +9384,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8871,6 +9418,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8929,6 +9480,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8949,6 +9502,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9108,6 +9663,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9137,6 +9695,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9167,6 +9729,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9225,6 +9791,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9245,6 +9813,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9400,6 +9970,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9430,6 +10003,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9462,6 +10039,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9523,6 +10104,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9545,6 +10128,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9724,6 +10309,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9754,6 +10342,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9786,6 +10378,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9847,6 +10443,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9869,6 +10467,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10048,6 +10648,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10078,6 +10681,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10110,6 +10717,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10171,6 +10782,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10193,6 +10806,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10372,6 +10987,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10402,6 +11020,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10434,6 +11056,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10495,6 +11121,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10517,6 +11145,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10696,6 +11326,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10726,6 +11359,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10758,6 +11395,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10819,6 +11460,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10841,6 +11484,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11016,6 +11661,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11046,6 +11694,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11078,6 +11730,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11139,6 +11795,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11161,6 +11819,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11340,6 +12000,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11370,6 +12033,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11402,6 +12069,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11463,6 +12134,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11485,6 +12158,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11664,6 +12339,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11694,6 +12372,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11726,6 +12408,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11787,6 +12473,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11809,6 +12497,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11988,6 +12678,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ; GFX7-LABEL: flat_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12003,6 +12696,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12019,6 +12716,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12050,6 +12751,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12064,6 +12767,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12175,6 +12880,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ; GFX7-LABEL: flat_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12190,6 +12898,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12206,6 +12918,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12237,6 +12953,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12251,6 +12969,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12362,6 +13082,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ; GFX7-LABEL: flat_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12379,6 +13102,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12398,6 +13125,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12433,6 +13164,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12450,6 +13183,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12576,6 +13311,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12594,6 +13332,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12615,6 +13357,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12653,6 +13399,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12671,6 +13419,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12816,6 +13566,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ; GFX7-LABEL: flat_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12827,6 +13580,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12839,6 +13596,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12862,6 +13623,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12873,6 +13636,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12956,6 +13721,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ; GFX7-LABEL: flat_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12967,6 +13735,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12979,6 +13751,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13002,6 +13778,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13013,6 +13791,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13096,6 +13876,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_store(
 ; GFX7-LABEL: flat_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13108,6 +13891,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13122,6 +13909,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13148,6 +13939,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13161,6 +13954,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13264,6 +14059,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13276,6 +14074,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13290,6 +14092,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13316,6 +14122,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13329,6 +14137,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13432,6 +14242,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13444,6 +14257,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13457,6 +14274,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13482,6 +14303,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13494,6 +14317,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13586,6 +14411,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13600,6 +14428,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13616,6 +14448,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13645,6 +14481,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13660,6 +14498,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13769,6 +14609,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13782,6 +14625,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13797,6 +14644,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13825,6 +14676,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13839,6 +14692,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13951,6 +14806,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13966,6 +14824,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13984,6 +14846,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14016,6 +14882,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14033,6 +14901,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14162,6 +15032,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14177,6 +15050,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14195,6 +15072,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14227,6 +15108,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14244,6 +15127,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14373,6 +15258,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14390,6 +15278,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14409,6 +15301,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14444,6 +15340,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14461,6 +15359,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14585,6 +15485,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14603,6 +15506,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14624,6 +15531,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14662,6 +15573,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14681,6 +15594,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14829,6 +15744,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14847,6 +15765,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14868,6 +15790,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14906,6 +15832,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14925,6 +15853,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15073,6 +16003,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15098,6 +16031,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15123,6 +16060,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15173,6 +16114,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15188,6 +16131,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15306,6 +16251,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15333,6 +16281,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15361,6 +16313,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15415,6 +16371,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15433,6 +16391,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15568,6 +16528,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15594,6 +16557,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15621,6 +16588,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15674,6 +16645,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15691,6 +16664,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15829,6 +16804,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15857,6 +16835,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15887,6 +16869,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15944,6 +16930,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15964,6 +16952,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16119,6 +17109,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16147,6 +17140,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16177,6 +17174,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16234,6 +17235,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16254,6 +17257,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16409,6 +17414,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16436,6 +17444,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16464,6 +17476,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16518,6 +17534,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16536,6 +17554,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16671,6 +17691,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16698,6 +17721,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16726,6 +17753,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16780,6 +17811,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16798,6 +17831,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16933,6 +17968,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16961,6 +17999,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16991,6 +18033,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17048,6 +18094,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17068,6 +18116,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17223,6 +18273,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17251,6 +18304,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17281,6 +18338,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17338,6 +18399,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17358,6 +18421,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17513,6 +18578,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17541,6 +18609,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17571,6 +18643,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17628,6 +18704,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17648,6 +18726,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17803,6 +18883,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17831,6 +18914,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17861,6 +18948,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17918,6 +19009,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17938,6 +19031,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18093,6 +19188,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18121,6 +19219,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18151,6 +19253,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18208,6 +19314,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18228,6 +19336,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18383,6 +19493,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18411,6 +19524,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18441,6 +19558,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18498,6 +19619,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18518,6 +19641,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18673,6 +19798,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18701,6 +19829,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18731,6 +19863,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18788,6 +19924,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18808,6 +19946,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18963,6 +20103,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18991,6 +20134,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19021,6 +20168,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19078,6 +20229,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19098,6 +20251,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19253,6 +20408,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19281,6 +20439,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19309,6 +20471,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19365,6 +20531,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19383,6 +20551,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19528,6 +20698,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19558,6 +20731,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19589,6 +20766,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19649,6 +20830,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19670,6 +20853,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19830,6 +21015,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19859,6 +21047,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19889,6 +21081,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19948,6 +21144,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19968,6 +21166,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20133,6 +21333,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20164,6 +21367,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20197,6 +21404,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20260,6 +21471,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20283,6 +21496,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20467,6 +21682,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20498,6 +21716,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20531,6 +21753,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20594,6 +21820,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20617,6 +21845,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20801,6 +22031,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20831,6 +22064,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20862,6 +22099,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20922,6 +22163,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20943,6 +22186,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21107,6 +22352,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21137,6 +22385,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21168,6 +22420,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21228,6 +22484,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21249,6 +22507,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21409,6 +22669,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21440,6 +22703,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21473,6 +22740,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21536,6 +22807,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21559,6 +22832,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21743,6 +23018,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21774,6 +23052,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21807,6 +23089,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21870,6 +23156,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21893,6 +23181,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22077,6 +23367,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22108,6 +23401,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22141,6 +23438,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22204,6 +23505,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22227,6 +23530,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22411,6 +23716,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22442,6 +23750,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22475,6 +23787,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22538,6 +23854,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22561,6 +23879,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22745,6 +24065,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22776,6 +24099,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22809,6 +24136,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22872,6 +24203,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22895,6 +24228,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23075,6 +24410,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23106,6 +24444,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23139,6 +24481,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23202,6 +24548,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23225,6 +24573,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23409,6 +24759,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23440,6 +24793,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23473,6 +24830,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23536,6 +24897,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23559,6 +24922,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23743,6 +25108,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23774,6 +25142,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23807,6 +25179,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23870,6 +25246,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23893,6 +25271,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index f1b465c1789da7..5f788e2e41ac55 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,6 +11,9 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -27,6 +30,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -44,6 +51,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -154,6 +165,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -183,6 +197,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -211,6 +229,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -411,6 +433,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -427,6 +452,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -444,6 +473,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -558,6 +591,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -587,6 +623,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -616,6 +656,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -821,6 +865,9 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -837,6 +884,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -854,6 +905,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -960,6 +1015,9 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX7-LABEL: flat_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -972,6 +1030,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -986,6 +1048,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 23982f8a00cdb8..dad713198cc89a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_wavefront_unordered_load(
 ; GFX7-LABEL: flat_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -202,6 +217,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -389,6 +419,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_load(
 ; GFX7-LABEL: flat_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -404,6 +437,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -420,6 +457,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -451,6 +492,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -465,6 +508,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -576,6 +621,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -591,6 +639,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -607,6 +659,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -638,6 +694,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -652,6 +710,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -763,6 +823,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_unordered_store(
 ; GFX7-LABEL: flat_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -774,6 +837,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -786,6 +853,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -809,6 +880,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -820,6 +893,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -903,6 +978,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -914,6 +992,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -926,6 +1008,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -949,6 +1035,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -960,6 +1048,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1043,6 +1133,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_store(
 ; GFX7-LABEL: flat_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1054,6 +1147,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1066,6 +1163,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1089,6 +1190,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1100,6 +1203,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1183,6 +1288,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1194,6 +1302,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1206,6 +1318,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1229,6 +1345,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1240,6 +1358,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1323,6 +1443,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1335,6 +1458,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1348,6 +1475,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1373,6 +1504,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1385,6 +1518,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1477,6 +1612,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1489,6 +1627,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1502,6 +1644,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1527,6 +1673,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1539,6 +1687,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1631,6 +1781,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1643,6 +1796,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1656,6 +1813,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1681,6 +1842,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1693,6 +1856,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1785,6 +1950,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1797,6 +1965,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1810,6 +1982,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1835,6 +2011,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1847,6 +2025,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1939,6 +2119,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1951,6 +2134,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1964,6 +2151,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1989,6 +2180,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2001,6 +2194,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2093,6 +2288,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2108,6 +2306,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2124,6 +2326,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2155,6 +2361,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2169,6 +2377,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2278,6 +2488,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2293,6 +2506,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2309,6 +2526,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2340,6 +2561,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2354,6 +2577,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2463,6 +2688,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2478,6 +2706,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2494,6 +2726,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2525,6 +2761,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2539,6 +2777,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2648,6 +2888,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2673,6 +2916,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2698,6 +2945,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2748,6 +2999,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2763,6 +3016,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2881,6 +3136,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2906,6 +3164,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2931,6 +3193,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2981,6 +3247,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2996,6 +3264,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3114,6 +3384,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3139,6 +3412,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3164,6 +3441,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3214,6 +3495,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3229,6 +3512,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3347,6 +3632,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3372,6 +3660,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3397,6 +3689,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3447,6 +3743,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3462,6 +3760,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3580,6 +3880,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3605,6 +3908,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3630,6 +3937,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3680,6 +3991,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3695,6 +4008,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3813,6 +4128,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3838,6 +4156,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3863,6 +4185,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3913,6 +4239,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3928,6 +4256,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4046,6 +4376,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4071,6 +4404,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4096,6 +4433,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4146,6 +4487,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4161,6 +4504,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4279,6 +4624,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4304,6 +4652,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4329,6 +4681,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4379,6 +4735,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4394,6 +4752,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4512,6 +4872,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4537,6 +4900,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4562,6 +4929,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4612,6 +4983,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4627,6 +5000,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4745,6 +5120,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4770,6 +5148,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4795,6 +5177,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4845,6 +5231,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4860,6 +5248,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4978,6 +5368,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5003,6 +5396,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5028,6 +5425,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5078,6 +5479,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5093,6 +5496,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5211,6 +5616,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5236,6 +5644,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5261,6 +5673,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5311,6 +5727,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5326,6 +5744,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5444,6 +5864,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5469,6 +5892,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5494,6 +5921,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5544,6 +5975,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5559,6 +5992,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5677,6 +6112,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5702,6 +6140,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5727,6 +6169,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5777,6 +6223,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5792,6 +6240,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5910,6 +6360,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5935,6 +6388,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5960,6 +6417,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6010,6 +6471,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6025,6 +6488,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6143,6 +6608,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6171,6 +6639,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6199,6 +6671,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6255,6 +6731,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6273,6 +6751,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6418,6 +6898,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6446,6 +6929,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6474,6 +6961,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6530,6 +7021,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6548,6 +7041,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6693,6 +7188,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6721,6 +7219,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6749,6 +7251,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6805,6 +7311,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6823,6 +7331,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6968,6 +7478,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6996,6 +7509,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7024,6 +7541,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7080,6 +7601,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7098,6 +7621,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7243,6 +7768,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7271,6 +7799,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7299,6 +7831,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7355,6 +7891,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7373,6 +7911,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7518,6 +8058,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7546,6 +8089,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7574,6 +8121,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7630,6 +8181,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7648,6 +8201,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7793,6 +8348,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7821,6 +8379,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7849,6 +8411,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7905,6 +8471,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7923,6 +8491,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8068,6 +8638,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8096,6 +8669,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8124,6 +8701,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8180,6 +8761,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8198,6 +8781,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8343,6 +8928,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8371,6 +8959,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8399,6 +8991,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8455,6 +9051,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8473,6 +9071,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8618,6 +9218,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8646,6 +9249,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8674,6 +9281,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8730,6 +9341,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8748,6 +9361,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8893,6 +9508,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8921,6 +9539,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8949,6 +9571,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9005,6 +9631,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9023,6 +9651,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9168,6 +9798,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9196,6 +9829,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9224,6 +9861,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9280,6 +9921,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9298,6 +9941,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9443,6 +10088,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9471,6 +10119,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9499,6 +10151,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9555,6 +10211,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9573,6 +10231,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9718,6 +10378,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9746,6 +10409,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9774,6 +10441,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9830,6 +10501,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9848,6 +10521,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9993,6 +10668,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10021,6 +10699,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10049,6 +10731,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10105,6 +10791,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10123,6 +10811,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10268,6 +10958,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10283,6 +10976,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10299,6 +10996,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10330,6 +11031,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10344,6 +11047,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10455,6 +11160,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10470,6 +11178,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10486,6 +11198,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10517,6 +11233,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10531,6 +11249,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10642,6 +11362,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10657,6 +11380,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10673,6 +11400,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10704,6 +11435,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10718,6 +11451,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10829,6 +11564,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10844,6 +11582,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10860,6 +11602,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10891,6 +11637,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10905,6 +11653,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11016,6 +11766,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11027,6 +11780,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11039,6 +11796,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11062,6 +11823,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11073,6 +11836,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11156,6 +11921,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11167,6 +11935,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11179,6 +11951,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11202,6 +11978,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11213,6 +11991,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11296,6 +12076,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ; GFX7-LABEL: flat_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11307,6 +12090,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11319,6 +12106,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11342,6 +12133,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11353,6 +12146,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11436,6 +12231,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11447,6 +12245,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11459,6 +12261,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11482,6 +12288,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11493,6 +12301,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11576,6 +12386,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11588,6 +12401,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11601,6 +12418,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11626,6 +12447,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11638,6 +12461,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11730,6 +12555,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11742,6 +12570,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11755,6 +12587,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11780,6 +12616,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11792,6 +12630,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11884,6 +12724,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11896,6 +12739,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11909,6 +12756,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11934,6 +12785,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11946,6 +12799,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12038,6 +12893,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12050,6 +12908,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12063,6 +12925,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12088,6 +12954,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12100,6 +12968,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12192,6 +13062,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12204,6 +13077,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12217,6 +13094,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12242,6 +13123,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12254,6 +13137,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12346,6 +13231,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12361,6 +13249,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12377,6 +13269,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12408,6 +13304,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12422,6 +13320,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12531,6 +13431,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12546,6 +13449,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12562,6 +13469,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12593,6 +13504,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12607,6 +13520,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12716,6 +13631,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12731,6 +13649,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12747,6 +13669,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12778,6 +13704,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12792,6 +13720,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12901,6 +13831,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -12926,6 +13859,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12951,6 +13888,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13001,6 +13942,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13016,6 +13959,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13134,6 +14079,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13159,6 +14107,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13184,6 +14136,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13234,6 +14190,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13249,6 +14207,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13367,6 +14327,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13392,6 +14355,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13417,6 +14384,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13467,6 +14438,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13482,6 +14455,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13600,6 +14575,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13625,6 +14603,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13650,6 +14632,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13700,6 +14686,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13715,6 +14703,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13833,6 +14823,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13858,6 +14851,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13883,6 +14880,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13933,6 +14934,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13948,6 +14951,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14066,6 +15071,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14091,6 +15099,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14116,6 +15128,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14166,6 +15182,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14181,6 +15199,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14299,6 +15319,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14324,6 +15347,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14349,6 +15376,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14399,6 +15430,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14414,6 +15447,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14532,6 +15567,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14557,6 +15595,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14582,6 +15624,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14632,6 +15678,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14647,6 +15695,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14765,6 +15815,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14790,6 +15843,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14815,6 +15872,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14865,6 +15926,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14880,6 +15943,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14998,6 +16063,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15023,6 +16091,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15048,6 +16120,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15098,6 +16174,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15113,6 +16191,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15231,6 +16311,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15256,6 +16339,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15281,6 +16368,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15331,6 +16422,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15346,6 +16439,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15464,6 +16559,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15489,6 +16587,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15514,6 +16616,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15564,6 +16670,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15579,6 +16687,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15697,6 +16807,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15722,6 +16835,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15747,6 +16864,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15797,6 +16918,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15812,6 +16935,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15930,6 +17055,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15955,6 +17083,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15980,6 +17112,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16030,6 +17166,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16045,6 +17183,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16163,6 +17303,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16188,6 +17331,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16213,6 +17360,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16263,6 +17414,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16278,6 +17431,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16396,6 +17551,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16424,6 +17582,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16452,6 +17614,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16508,6 +17674,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16526,6 +17694,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16671,6 +17841,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16699,6 +17872,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16727,6 +17904,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16783,6 +17964,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16801,6 +17984,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16946,6 +18131,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16974,6 +18162,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17002,6 +18194,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17058,6 +18254,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17076,6 +18274,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17221,6 +18421,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17249,6 +18452,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17277,6 +18484,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17333,6 +18544,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17351,6 +18564,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17496,6 +18711,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17524,6 +18742,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17552,6 +18774,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17608,6 +18834,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17626,6 +18854,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17771,6 +19001,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17799,6 +19032,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17827,6 +19064,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17883,6 +19124,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17901,6 +19144,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18046,6 +19291,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18074,6 +19322,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18102,6 +19354,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18158,6 +19414,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18176,6 +19434,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18321,6 +19581,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18349,6 +19612,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18377,6 +19644,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18433,6 +19704,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18451,6 +19724,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18596,6 +19871,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18624,6 +19902,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18652,6 +19934,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18708,6 +19994,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18726,6 +20014,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18871,6 +20161,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18899,6 +20192,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18927,6 +20224,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18983,6 +20284,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19001,6 +20304,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19146,6 +20451,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19174,6 +20482,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19202,6 +20514,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19258,6 +20574,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19276,6 +20594,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19421,6 +20741,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19449,6 +20772,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19477,6 +20804,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19533,6 +20864,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19551,6 +20884,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19696,6 +21031,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19724,6 +21062,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19752,6 +21094,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19808,6 +21154,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19826,6 +21174,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19971,6 +21321,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19999,6 +21352,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20027,6 +21384,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20083,6 +21444,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20101,6 +21464,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 5ddabad7374ddd..a350394bcafe50 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_workgroup_unordered_load(
 ; GFX7-LABEL: flat_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -202,6 +217,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -217,6 +235,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -233,6 +255,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -264,6 +290,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -278,6 +306,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -389,6 +419,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_load(
 ; GFX7-LABEL: flat_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -405,6 +438,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -422,6 +459,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -455,6 +496,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -470,6 +513,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -588,6 +633,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -605,6 +653,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -624,6 +676,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -659,6 +715,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -675,6 +733,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -806,6 +866,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_unordered_store(
 ; GFX7-LABEL: flat_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -817,6 +880,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -829,6 +896,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -852,6 +923,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -863,6 +936,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -946,6 +1021,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -957,6 +1035,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -969,6 +1051,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -992,6 +1078,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1003,6 +1091,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1086,6 +1176,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX7-LABEL: flat_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1098,6 +1191,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1112,6 +1209,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1137,6 +1238,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1149,6 +1252,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1244,6 +1349,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1256,6 +1364,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1270,6 +1382,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1295,6 +1411,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1307,6 +1425,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1402,6 +1522,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1414,6 +1537,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1427,6 +1554,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1452,6 +1583,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1464,6 +1597,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1556,6 +1691,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1569,6 +1707,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1585,6 +1727,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1612,6 +1758,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1625,6 +1773,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1729,6 +1879,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1742,6 +1895,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1757,6 +1914,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1784,6 +1945,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1797,6 +1960,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1901,6 +2066,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1915,6 +2083,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1933,6 +2105,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1962,6 +2138,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1976,6 +2154,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2092,6 +2272,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2106,6 +2289,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2124,6 +2311,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2153,6 +2344,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2167,6 +2360,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2283,6 +2478,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2299,6 +2497,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2316,6 +2518,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2349,6 +2555,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2364,6 +2572,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2480,6 +2690,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2497,6 +2710,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2516,6 +2733,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2551,6 +2772,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2567,6 +2790,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2697,6 +2922,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2714,6 +2942,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2733,6 +2965,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2768,6 +3004,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2784,6 +3022,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2914,6 +3154,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2939,6 +3182,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2964,6 +3211,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3014,6 +3265,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3029,6 +3282,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3147,6 +3402,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3173,6 +3431,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3201,6 +3463,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3253,6 +3519,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3269,6 +3537,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3399,6 +3669,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3425,6 +3698,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3452,6 +3729,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3504,6 +3785,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3520,6 +3803,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3650,6 +3935,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3677,6 +3965,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3707,6 +3999,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3761,6 +4057,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3778,6 +4076,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3920,6 +4220,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3947,6 +4250,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3977,6 +4284,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4031,6 +4342,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4048,6 +4361,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4190,6 +4505,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4216,6 +4534,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4244,6 +4566,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4296,6 +4622,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4312,6 +4640,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4442,6 +4772,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4468,6 +4801,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4496,6 +4833,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4548,6 +4889,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4564,6 +4907,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4694,6 +5039,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4721,6 +5069,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4751,6 +5103,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4805,6 +5161,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4822,6 +5180,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4964,6 +5324,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4991,6 +5354,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5021,6 +5388,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5075,6 +5446,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5092,6 +5465,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5234,6 +5609,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5261,6 +5639,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5291,6 +5673,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5345,6 +5731,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5362,6 +5750,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5504,6 +5894,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5531,6 +5924,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5561,6 +5958,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5615,6 +6016,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5632,6 +6035,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5774,6 +6179,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -5802,6 +6210,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -5830,6 +6242,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -5886,6 +6302,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5904,6 +6322,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6049,6 +6469,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6078,6 +6501,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6107,6 +6534,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6165,6 +6596,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6184,6 +6617,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6336,6 +6771,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6365,6 +6803,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6395,6 +6837,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6453,6 +6899,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6472,6 +6920,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6629,6 +7079,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6659,6 +7112,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6690,6 +7147,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6750,6 +7211,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6770,6 +7233,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6936,6 +7401,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6966,6 +7434,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6997,6 +7469,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7057,6 +7533,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7077,6 +7555,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7243,6 +7723,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7272,6 +7755,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7301,6 +7788,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7359,6 +7850,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7378,6 +7871,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7532,6 +8027,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7561,6 +8059,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7590,6 +8092,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7648,6 +8154,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7667,6 +8175,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7819,6 +8329,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7849,6 +8362,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7880,6 +8397,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7940,6 +8461,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7960,6 +8483,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8126,6 +8651,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8156,6 +8684,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8187,6 +8719,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8247,6 +8783,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8267,6 +8805,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8433,6 +8973,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8463,6 +9006,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8494,6 +9041,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8554,6 +9105,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8574,6 +9127,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8740,6 +9295,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8770,6 +9328,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8801,6 +9363,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8861,6 +9427,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8881,6 +9449,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9047,6 +9617,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9077,6 +9650,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9108,6 +9685,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9168,6 +9749,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9188,6 +9771,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9352,6 +9937,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9382,6 +9970,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9413,6 +10005,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9473,6 +10069,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9493,6 +10091,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9659,6 +10259,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9689,6 +10292,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9720,6 +10327,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9780,6 +10391,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9800,6 +10413,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9966,6 +10581,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9996,6 +10614,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10027,6 +10649,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10087,6 +10713,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10107,6 +10735,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10273,6 +10903,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10288,6 +10921,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10304,6 +10941,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10335,6 +10976,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10349,6 +10992,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10460,6 +11105,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10475,6 +11123,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10491,6 +11143,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10522,6 +11178,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10536,6 +11194,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10647,6 +11307,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10662,6 +11325,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10680,6 +11347,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10711,6 +11382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10725,6 +11398,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10842,6 +11517,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10857,6 +11535,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10877,6 +11559,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10908,6 +11594,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10922,6 +11610,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11049,6 +11739,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11060,6 +11753,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11072,6 +11769,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11095,6 +11796,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11106,6 +11809,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11189,6 +11894,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11200,6 +11908,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11212,6 +11924,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11235,6 +11951,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11246,6 +11964,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11329,6 +12049,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX7-LABEL: flat_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11340,6 +12063,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11354,6 +12081,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11377,6 +12108,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11388,6 +12121,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11480,6 +12215,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11491,6 +12229,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11505,6 +12247,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11528,6 +12274,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11539,6 +12287,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11631,6 +12381,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11643,6 +12396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11656,6 +12413,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11681,6 +12442,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11693,6 +12456,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11785,6 +12550,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11797,6 +12565,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11812,6 +12584,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11837,6 +12613,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11849,6 +12627,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11949,6 +12729,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11961,6 +12744,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11976,6 +12763,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12001,6 +12792,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12013,6 +12806,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12114,6 +12909,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12126,6 +12924,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12143,6 +12945,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12168,6 +12974,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12180,6 +12988,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12289,6 +13099,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12301,6 +13114,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12318,6 +13135,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12343,6 +13164,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12355,6 +13178,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12464,6 +13289,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12479,6 +13307,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12497,6 +13329,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12528,6 +13364,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12542,6 +13380,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12657,6 +13497,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12672,6 +13515,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12692,6 +13539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12723,6 +13574,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12737,6 +13590,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12863,6 +13718,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12878,6 +13736,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12898,6 +13760,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12929,6 +13795,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12943,6 +13811,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13069,6 +13939,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13094,6 +13967,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13119,6 +13996,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13169,6 +14050,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13184,6 +14067,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13302,6 +14187,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13327,6 +14215,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13354,6 +14246,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13404,6 +14300,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13419,6 +14317,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13545,6 +14445,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13570,6 +14473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13597,6 +14504,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13647,6 +14558,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13662,6 +14575,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13789,6 +14704,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13814,6 +14732,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13843,6 +14765,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13893,6 +14819,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13908,6 +14836,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14043,6 +14973,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14068,6 +15001,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14097,6 +15034,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14147,6 +15088,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14162,6 +15105,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14297,6 +15242,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14322,6 +15270,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14349,6 +15301,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14399,6 +15355,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14414,6 +15372,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14540,6 +15500,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14565,6 +15528,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14592,6 +15559,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14642,6 +15613,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14657,6 +15630,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14783,6 +15758,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14808,6 +15786,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14837,6 +15819,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14887,6 +15873,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14902,6 +15890,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15037,6 +16027,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15062,6 +16055,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15091,6 +16088,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15141,6 +16142,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15156,6 +16159,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15291,6 +16296,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15316,6 +16324,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15345,6 +16357,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15395,6 +16411,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15410,6 +16428,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15545,6 +16565,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15570,6 +16593,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15599,6 +16626,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15649,6 +16680,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15664,6 +16697,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15799,6 +16834,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15824,6 +16862,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15853,6 +16895,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15903,6 +16949,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15918,6 +16966,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16053,6 +17103,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16078,6 +17131,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16107,6 +17164,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16157,6 +17218,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16172,6 +17235,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16307,6 +17372,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16332,6 +17400,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16361,6 +17433,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16411,6 +17487,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16426,6 +17504,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16561,6 +17641,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16586,6 +17669,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16615,6 +17702,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16665,6 +17756,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16680,6 +17773,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16815,6 +17910,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16843,6 +17941,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16871,6 +17973,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16927,6 +18033,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16945,6 +18053,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17090,6 +18200,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17118,6 +18231,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17148,6 +18265,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17204,6 +18325,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17222,6 +18345,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17373,6 +18498,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17401,6 +18529,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17431,6 +18563,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17487,6 +18623,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17505,6 +18643,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17659,6 +18799,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17687,6 +18830,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17719,6 +18866,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17775,6 +18926,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17793,6 +18946,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17955,6 +19110,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17983,6 +19141,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18015,6 +19177,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18071,6 +19237,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18089,6 +19257,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18251,6 +19421,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18279,6 +19452,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18309,6 +19486,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18365,6 +19546,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18383,6 +19566,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18536,6 +19721,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18564,6 +19752,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18594,6 +19786,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18650,6 +19846,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18668,6 +19866,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18819,6 +20019,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18847,6 +20050,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18879,6 +20086,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18935,6 +20146,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18953,6 +20166,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19115,6 +20330,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19143,6 +20361,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19175,6 +20397,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19231,6 +20457,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19249,6 +20477,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19411,6 +20641,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19439,6 +20672,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19471,6 +20708,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19527,6 +20768,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19545,6 +20788,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19707,6 +20952,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19735,6 +20983,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19767,6 +21019,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19823,6 +21079,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19841,6 +21099,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20003,6 +21263,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20031,6 +21294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20063,6 +21330,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20119,6 +21390,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20137,6 +21410,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20297,6 +21572,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20325,6 +21603,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20357,6 +21639,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20413,6 +21699,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20431,6 +21719,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20593,6 +21883,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20621,6 +21914,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20653,6 +21950,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20709,6 +22010,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20727,6 +22030,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20889,6 +22194,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20917,6 +22225,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20949,6 +22261,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21005,6 +22321,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21023,6 +22341,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 8a02ad5dfdb7b0..8d14f92d9806e3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_agent_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -238,6 +241,9 @@ define amdgpu_kernel void @global_agent_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -437,6 +443,9 @@ define amdgpu_kernel void @global_agent_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -652,6 +661,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -876,6 +888,9 @@ define amdgpu_kernel void @global_agent_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1045,6 +1060,9 @@ define amdgpu_kernel void @global_agent_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1215,6 +1233,9 @@ define amdgpu_kernel void @global_agent_release_store(
 ;
 ; GFX7-LABEL: global_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1411,6 +1432,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1606,6 +1630,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1766,6 +1793,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1952,6 +1982,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2139,6 +2172,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2353,6 +2389,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2567,6 +2606,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2770,6 +2812,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3003,6 +3048,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3236,6 +3284,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3464,6 +3515,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3718,6 +3772,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3973,6 +4030,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4255,6 +4315,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4536,6 +4599,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4791,6 +4857,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5047,6 +5116,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5329,6 +5401,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5611,6 +5686,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5893,6 +5971,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6175,6 +6256,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6457,6 +6541,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6739,6 +6826,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7021,6 +7111,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7303,6 +7396,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7561,6 +7657,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7835,6 +7934,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8120,6 +8222,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8425,6 +8530,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8729,6 +8837,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9007,6 +9118,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9282,6 +9396,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9587,6 +9704,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9892,6 +10012,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10197,6 +10320,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10502,6 +10628,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10803,6 +10932,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11108,6 +11240,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11413,6 +11548,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11716,6 +11854,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11914,6 +12055,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12113,6 +12257,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12328,6 +12475,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12552,6 +12702,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12721,6 +12874,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12891,6 +13047,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13087,6 +13246,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13282,6 +13444,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13442,6 +13607,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13628,6 +13796,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13815,6 +13986,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14029,6 +14203,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14243,6 +14420,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14446,6 +14626,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14679,6 +14862,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14912,6 +15098,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15140,6 +15329,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15394,6 +15586,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15649,6 +15844,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15931,6 +16129,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16212,6 +16413,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16467,6 +16671,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16723,6 +16930,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17005,6 +17215,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17287,6 +17500,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17569,6 +17785,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17851,6 +18070,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18133,6 +18355,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18415,6 +18640,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18697,6 +18925,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18979,6 +19210,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19237,6 +19471,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19512,6 +19749,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19817,6 +20057,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20121,6 +20364,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20399,6 +20645,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20674,6 +20923,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20979,6 +21231,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21284,6 +21539,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21589,6 +21847,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21894,6 +22155,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22195,6 +22459,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22500,6 +22767,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22805,6 +23075,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 14f1734235673a..1069cb6f0135d9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -35,6 +35,9 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ;
 ; GFX7-LABEL: global_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -242,6 +245,9 @@ define amdgpu_kernel void @global_nontemporal_load_1(
 ;
 ; GFX7-LABEL: global_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -506,6 +512,9 @@ define amdgpu_kernel void @global_nontemporal_store_0(
 ;
 ; GFX7-LABEL: global_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -708,6 +717,9 @@ define amdgpu_kernel void @global_nontemporal_store_1(
 ;
 ; GFX7-LABEL: global_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -961,6 +973,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: global_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 33aaeebf658dd6..bf4d77ad61c6b2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -238,6 +241,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -436,6 +442,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -634,6 +643,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -825,6 +837,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -994,6 +1009,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1163,6 +1181,9 @@ define amdgpu_kernel void @global_singlethread_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1332,6 +1353,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1501,6 +1525,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1659,6 +1686,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1817,6 +1847,9 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1975,6 +2008,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2133,6 +2169,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2293,6 +2332,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2479,6 +2521,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2665,6 +2710,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2853,6 +2901,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3079,6 +3130,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3305,6 +3359,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3531,6 +3588,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3757,6 +3817,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3983,6 +4046,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4209,6 +4275,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4435,6 +4504,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4661,6 +4733,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4887,6 +4962,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5113,6 +5191,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5339,6 +5420,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5565,6 +5649,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5791,6 +5878,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6017,6 +6107,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6246,6 +6339,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6503,6 +6599,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6760,6 +6859,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7017,6 +7119,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7274,6 +7379,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7531,6 +7639,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7788,6 +7899,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8045,6 +8159,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8302,6 +8419,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8559,6 +8679,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8816,6 +8939,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9073,6 +9199,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9330,6 +9459,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9587,6 +9719,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9844,6 +9979,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10101,6 +10239,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10299,6 +10440,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10497,6 +10641,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10695,6 +10842,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10886,6 +11036,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11055,6 +11208,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11224,6 +11380,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11393,6 +11552,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11562,6 +11724,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11720,6 +11885,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11878,6 +12046,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12036,6 +12207,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12194,6 +12368,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12354,6 +12531,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12540,6 +12720,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12726,6 +12909,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12914,6 +13100,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13140,6 +13329,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13366,6 +13558,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13592,6 +13787,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13818,6 +14016,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14044,6 +14245,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14270,6 +14474,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14496,6 +14703,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14722,6 +14932,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14948,6 +15161,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15174,6 +15390,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15400,6 +15619,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15626,6 +15848,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15852,6 +16077,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16078,6 +16306,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16307,6 +16538,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16564,6 +16798,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16821,6 +17058,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17078,6 +17318,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17335,6 +17578,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17592,6 +17838,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17849,6 +18098,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18106,6 +18358,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18363,6 +18618,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18620,6 +18878,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18877,6 +19138,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19134,6 +19398,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19391,6 +19658,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19648,6 +19918,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19905,6 +20178,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index ae5ec082024fd6..ecef93dcd84d90 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_system_unordered_load(
 ;
 ; GFX7-LABEL: global_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -238,6 +241,9 @@ define amdgpu_kernel void @global_system_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -437,6 +443,9 @@ define amdgpu_kernel void @global_system_acquire_load(
 ;
 ; GFX7-LABEL: global_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -654,6 +663,9 @@ define amdgpu_kernel void @global_system_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -880,6 +892,9 @@ define amdgpu_kernel void @global_system_unordered_store(
 ;
 ; GFX7-LABEL: global_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1049,6 +1064,9 @@ define amdgpu_kernel void @global_system_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1219,6 +1237,9 @@ define amdgpu_kernel void @global_system_release_store(
 ;
 ; GFX7-LABEL: global_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1417,6 +1438,9 @@ define amdgpu_kernel void @global_system_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1614,6 +1638,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1774,6 +1801,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1962,6 +1992,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2151,6 +2184,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2369,6 +2405,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2587,6 +2626,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2792,6 +2834,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3029,6 +3074,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3266,6 +3314,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3494,6 +3545,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3750,6 +3804,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4007,6 +4064,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4293,6 +4353,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4578,6 +4641,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4835,6 +4901,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5093,6 +5162,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5379,6 +5451,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5665,6 +5740,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5951,6 +6029,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6237,6 +6318,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6495,6 +6579,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6772,6 +6859,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7081,6 +7171,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7389,6 +7482,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7669,6 +7765,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7946,6 +8045,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8255,6 +8357,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8564,6 +8669,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8873,6 +8981,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9182,6 +9293,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9487,6 +9601,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9796,6 +9913,9 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10105,6 +10225,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10412,6 +10535,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10610,6 +10736,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10809,6 +10938,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11026,6 +11158,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11252,6 +11387,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11421,6 +11559,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11591,6 +11732,9 @@ define amdgpu_kernel void @global_system_one_as_release_store(
 ;
 ; GFX7-LABEL: global_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11789,6 +11933,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11986,6 +12133,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12146,6 +12296,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12334,6 +12487,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12523,6 +12679,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12741,6 +12900,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12959,6 +13121,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13164,6 +13329,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13401,6 +13569,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13638,6 +13809,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13866,6 +14040,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14122,6 +14299,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14379,6 +14559,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14665,6 +14848,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14950,6 +15136,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15207,6 +15396,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15465,6 +15657,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15751,6 +15946,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16037,6 +16235,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16323,6 +16524,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16609,6 +16813,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16895,6 +17102,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17181,6 +17391,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17467,6 +17680,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17753,6 +17969,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18011,6 +18230,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18287,6 +18509,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18574,6 +18799,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18883,6 +19111,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19191,6 +19422,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19471,6 +19705,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19748,6 +19985,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20057,6 +20297,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20366,6 +20609,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20675,6 +20921,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20984,6 +21233,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21289,6 +21541,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21598,6 +21853,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21907,6 +22165,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index d916ff533e77bd..eea71794e549e8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -36,6 +36,9 @@ define amdgpu_kernel void @global_volatile_load_0(
 ;
 ; GFX7-LABEL: global_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -192,6 +195,9 @@ define amdgpu_kernel void @global_volatile_load_1(
 ;
 ; GFX7-LABEL: global_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -389,6 +395,9 @@ define amdgpu_kernel void @global_volatile_store_0(
 ;
 ; GFX7-LABEL: global_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -554,6 +563,9 @@ define amdgpu_kernel void @global_volatile_store_1(
 ;
 ; GFX7-LABEL: global_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -757,6 +769,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -900,6 +915,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index aaa11c0455606f..95bc4ddd0cff74 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -238,6 +241,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -436,6 +442,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -634,6 +643,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -825,6 +837,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -994,6 +1009,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1163,6 +1181,9 @@ define amdgpu_kernel void @global_wavefront_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1332,6 +1353,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1501,6 +1525,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1659,6 +1686,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1817,6 +1847,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1975,6 +2008,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2133,6 +2169,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2293,6 +2332,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2479,6 +2521,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2665,6 +2710,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2853,6 +2901,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3079,6 +3130,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3305,6 +3359,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3531,6 +3588,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3757,6 +3817,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3983,6 +4046,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4209,6 +4275,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4435,6 +4504,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4661,6 +4733,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4887,6 +4962,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5113,6 +5191,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5339,6 +5420,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5565,6 +5649,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5791,6 +5878,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6017,6 +6107,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6246,6 +6339,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6503,6 +6599,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6760,6 +6859,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7017,6 +7119,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7274,6 +7379,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7531,6 +7639,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7788,6 +7899,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8045,6 +8159,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8302,6 +8419,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8559,6 +8679,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8816,6 +8939,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9073,6 +9199,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9330,6 +9459,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9587,6 +9719,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9844,6 +9979,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10101,6 +10239,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10299,6 +10440,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10497,6 +10641,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10695,6 +10842,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10886,6 +11036,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11055,6 +11208,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11224,6 +11380,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11393,6 +11552,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11562,6 +11724,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11720,6 +11885,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11878,6 +12046,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12036,6 +12207,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12194,6 +12368,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12354,6 +12531,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12540,6 +12720,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12726,6 +12909,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12914,6 +13100,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13140,6 +13329,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13366,6 +13558,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13592,6 +13787,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13818,6 +14016,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14044,6 +14245,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14270,6 +14474,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14496,6 +14703,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14722,6 +14932,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14948,6 +15161,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15174,6 +15390,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15400,6 +15619,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15626,6 +15848,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15852,6 +16077,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16078,6 +16306,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16307,6 +16538,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16564,6 +16798,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16821,6 +17058,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17078,6 +17318,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17335,6 +17578,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17592,6 +17838,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17849,6 +18098,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18106,6 +18358,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18363,6 +18618,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18620,6 +18878,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18877,6 +19138,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19134,6 +19398,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19391,6 +19658,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19648,6 +19918,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19905,6 +20178,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 26511f079fa8f8..28c6f255e86a8e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -238,6 +241,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -436,6 +442,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -640,6 +649,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -847,6 +859,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1016,6 +1031,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1186,6 +1204,9 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1374,6 +1395,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1561,6 +1585,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1719,6 +1746,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1888,6 +1918,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2065,6 +2098,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2252,6 +2288,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2440,6 +2479,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2632,6 +2674,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2844,6 +2889,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3057,6 +3105,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3283,6 +3334,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3520,6 +3574,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3765,6 +3822,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4020,6 +4080,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4274,6 +4337,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4510,6 +4576,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4747,6 +4816,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5002,6 +5074,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5257,6 +5332,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5512,6 +5590,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5767,6 +5848,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6022,6 +6106,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6277,6 +6364,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6532,6 +6622,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6789,6 +6882,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7046,6 +7142,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7309,6 +7408,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7585,6 +7687,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7868,6 +7973,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8150,6 +8258,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8414,6 +8525,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8677,6 +8791,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8960,6 +9077,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9243,6 +9363,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9526,6 +9649,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9809,6 +9935,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10090,6 +10219,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10373,6 +10505,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10656,6 +10791,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10938,6 +11076,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11136,6 +11277,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11334,6 +11478,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11537,6 +11684,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11741,6 +11891,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11910,6 +12063,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12079,6 +12235,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12259,6 +12418,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12439,6 +12601,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12597,6 +12762,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12765,6 +12933,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12934,6 +13105,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13113,6 +13287,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13294,6 +13471,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13485,6 +13665,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13689,6 +13872,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13895,6 +14081,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14121,6 +14310,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14357,6 +14549,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14594,6 +14789,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14841,6 +15039,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15088,6 +15289,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15324,6 +15528,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15560,6 +15767,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15807,6 +16017,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16054,6 +16267,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16301,6 +16517,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16548,6 +16767,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16795,6 +17017,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17042,6 +17267,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17289,6 +17517,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17539,6 +17770,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17796,6 +18030,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18058,6 +18295,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18326,6 +18566,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18601,6 +18844,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18876,6 +19122,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19140,6 +19389,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19402,6 +19654,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19677,6 +19932,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19952,6 +20210,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20227,6 +20488,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20502,6 +20766,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20775,6 +21042,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21050,6 +21320,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21325,6 +21598,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index fce60ff12aed3d..24598bcbea5bba 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -37,6 +37,9 @@ define amdgpu_kernel void @local_nontemporal_load_0(
 ;
 ; GFX7-LABEL: local_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -240,6 +243,9 @@ define amdgpu_kernel void @local_nontemporal_load_1(
 ;
 ; GFX7-LABEL: local_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -895,6 +901,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: local_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index a8f7051bd5050c..455834572a59ac 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -33,6 +33,9 @@ define amdgpu_kernel void @local_volatile_load_0(
 ;
 ; GFX7-LABEL: local_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -180,6 +183,9 @@ define amdgpu_kernel void @local_volatile_load_1(
 ;
 ; GFX7-LABEL: local_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index c3599c87985bec..55c40ee491100e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -37,7 +37,10 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX7-LABEL: private_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -69,7 +72,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -110,7 +113,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -126,7 +129,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -248,7 +251,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX7-LABEL: private_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -266,7 +272,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -284,7 +290,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -329,7 +335,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -350,7 +356,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -507,7 +513,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX7-LABEL: private_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -522,7 +528,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -538,7 +544,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -573,7 +579,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -589,7 +595,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -705,7 +711,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX7-LABEL: private_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -722,7 +728,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -738,7 +744,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -775,7 +781,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -794,7 +800,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -945,7 +951,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: private_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -961,7 +970,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -977,7 +986,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1018,7 +1027,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1034,7 +1043,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 9146f175eefcd1..cdb1b463ac321e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -37,7 +37,10 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX7-LABEL: private_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -69,7 +72,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -198,7 +201,10 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX7-LABEL: private_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -216,7 +222,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -234,7 +240,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -386,7 +392,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX7-LABEL: private_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -402,7 +408,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -419,7 +425,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -549,7 +555,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX7-LABEL: private_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -567,7 +573,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -584,7 +590,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index d070dc3b770f81..1fd311f225db4a 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
 attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
 attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index f62f1d57aec8e4..9577f2a932f966 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -399,26 +399,26 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" }
 attributes #18 = { "amdgpu-waves-per-eu"="9,10" }
 attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
index c1d647c5d3b907..9758afa3b49912 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
@@ -19,5 +19,5 @@ define void @hoge()  {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 4420833029d466..8792e60bb0ca16 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -191,11 +191,11 @@ define amdgpu_kernel void @kernel_lds_recursion() {
 !1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
 
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 59036c64c8afcc..a51b0128a3a4a4 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -7,7 +7,7 @@
 define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 s0, s0, s13
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_load_dword s4, s[6:7], 0x2
 ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index cca7b49996ff3b..183cdb2e1f8627 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -80,8 +80,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ;.
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
 ;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
 ;.
 ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
index 049db01badacf9..2ab48479fa1b08 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -31,6 +31,6 @@ define amdgpu_kernel void @kernel1() #1 {
 
 attributes #0 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
index c9387f196dff96..cc58d34a8b255b 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
@@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 {
 attributes #0 = { "uniform-work-group-size"="true" }
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
index 7183da2c5efc36..33298bde89b975 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 {
 
 attributes #2 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
index 6ed04cf63d20be..6fe85cd7c9e654 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @kernel2() #2 {
 
 attributes #1 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
index d5ba2fd617c6ef..f6ab402e012325 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -52,8 +52,8 @@ attributes #0 = { nounwind }
 attributes #1 = { "uniform-work-group-size"="false" }
 attributes #2 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
index 7f0dfeaf75c808..37e3376ef9d500 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 {
 attributes #0 = { nounwind readnone }
 attributes #1 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
index 8616c73ad51c13..7f83686bc756e8 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -61,6 +61,6 @@ define amdgpu_kernel void @kernel3() #0 {
 
 attributes #0 = { "uniform-work-group-size"="false" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index a827ebe96cfcf4..43a948b6c6ab27 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -11,7 +11,7 @@
 define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_add_u32 s0, s0, s13
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0

>From 7eacaf22e1de95f1c45214c228b78ebe7ad11827 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Fri, 16 Aug 2024 10:43:34 -0700
Subject: [PATCH 02/13] (1) Use getCalledFunction instead of getCalledOperand
 (2) other minor code change based on reviews (3) fix test files.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  22 +-
 .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll  | 368 ++++++++++++--
 .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll  | 390 ++++++++++++--
 .../AMDGPU/GlobalISel/extractelement.ll       |  69 +--
 ...licit-kernarg-backend-usage-global-isel.ll |  50 +-
 .../GlobalISel/insertelement-stack-lower.ll   |   2 +-
 .../AMDGPU/GlobalISel/lds-global-value.ll     |   5 +-
 .../GlobalISel/llvm.amdgcn.if.break.i64.ll    |   3 +
 .../GlobalISel/llvm.amdgcn.trig.preop.ll      |  24 +
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll |  33 ++
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  33 ++
 .../abi-attribute-hints-undefined-behavior.ll |  18 +-
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll     |  41 +-
 llvm/test/CodeGen/AMDGPU/always-uniform.ll    |   3 +
 ...amdgpu-codegenprepare-fold-binop-select.ll |  10 +-
 .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll   |   4 +-
 .../annotate-kernel-features-hsa-call.ll      |   2 +-
 .../AMDGPU/annotate-kernel-features-hsa.ll    |  30 +-
 .../attr-amdgpu-flat-work-group-size.ll       |   4 +-
 .../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll    |   6 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        |   4 +-
 .../attributor-flatscratchinit-globalisel.ll  |   2 +-
 .../AMDGPU/attributor-flatscratchinit.ll      | 257 +++++-----
 .../AMDGPU/call-graph-register-usage.ll       |   8 +-
 llvm/test/CodeGen/AMDGPU/code-object-v3.ll    |  12 +-
 .../CodeGen/AMDGPU/combine-reg-or-const.ll    |   3 +
 ...dagcomb-extract-vec-elt-different-sizes.ll |   2 +
 ...cannot-create-empty-or-backward-segment.ll |   2 +-
 .../expand-scalar-carry-out-select-user.ll    |   3 +
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |  66 +++
 .../fast-unaligned-load-store.global.ll       |  20 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     | 236 ++++++++-
 .../flat-for-global-subtarget-feature.ll      |   7 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  | 105 ++--
 .../AMDGPU/fmul-2-combine-multi-use.ll        |  48 ++
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |  60 +++
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   3 +
 llvm/test/CodeGen/AMDGPU/fneg.f16.ll          |  62 ++-
 llvm/test/CodeGen/AMDGPU/half.ll              | 231 +++++++++
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |   9 +-
 llvm/test/CodeGen/AMDGPU/hsa.ll               |   4 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |  50 +-
 .../indirect-call-set-from-other-function.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll   |  36 +-
 .../AMDGPU/insert_vector_elt.v2bf16.ll        |  58 ++-
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 214 +++++++-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |   2 +-
 llvm/test/CodeGen/AMDGPU/kernarg-size.ll      |   2 +-
 llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll  | 248 ++++++---
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |  37 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |  70 ++-
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    | 114 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 126 ++++-
 llvm/test/CodeGen/AMDGPU/load-constant-f64.ll |   6 +
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 127 ++++-
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |  85 +++-
 llvm/test/CodeGen/AMDGPU/load-constant-i64.ll |  18 +
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  | 164 +++++-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   | 129 ++++-
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   |  83 ++-
 llvm/test/CodeGen/AMDGPU/load-select-ptr.ll   |   3 +-
 .../AMDGPU/lower-module-lds-via-hybrid.ll     |  16 +-
 .../AMDGPU/lower-module-lds-via-table.ll      |  15 +-
 .../CodeGen/AMDGPU/mad24-get-global-id.ll     |   2 +-
 .../match-perm-extract-vector-elt-bug.ll      |   8 +-
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    |  20 +-
 llvm/test/CodeGen/AMDGPU/min.ll               | 211 +++++++-
 llvm/test/CodeGen/AMDGPU/pack.v2f16.ll        |  21 +
 llvm/test/CodeGen/AMDGPU/pack.v2i16.ll        |  18 +
 ...al-regcopy-and-spill-missed-at-regalloc.ll |  61 ++-
 .../AMDGPU/propagate-flat-work-group-size.ll  |   2 +-
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  |   4 +-
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/sad.ll               |  68 ++-
 .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll  |  16 +
 .../scc-clobbered-sgpr-to-vmem-spill.ll       | 474 +++++++++---------
 llvm/test/CodeGen/AMDGPU/shift-i128.ll        |  24 +-
 .../CodeGen/AMDGPU/simple-indirect-call-2.ll  |  10 +-
 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll    |  70 ++-
 .../CodeGen/AMDGPU/spill-vector-superclass.ll |   8 +-
 llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll |   6 +
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll   |   9 +-
 llvm/test/CodeGen/AMDGPU/trap-abis.ll         |  16 +-
 llvm/test/CodeGen/AMDGPU/udiv.ll              |  45 ++
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll    |  91 +++-
 ...ine-function-info-long-branch-reg-debug.ll |   7 +-
 .../machine-function-info-long-branch-reg.ll  |   7 +-
 89 files changed, 4176 insertions(+), 900 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 76eba1aa9ffcd9..3ef0694046553b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -440,17 +440,11 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       return;
     }
 
-    bool HasAllocaOrASCast = false;
-    for (BasicBlock &BB : *F) {
-      for (Instruction &I : BB) {
-        if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
-          HasAllocaOrASCast = true;
-          removeAssumedBits(FLAT_SCRATCH_INIT);
-          break;
-        }
+    for (Instruction &I : instructions(F)) {
+      if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
+        removeAssumedBits(FLAT_SCRATCH_INIT);
+        return;
       }
-      if (HasAllocaOrASCast)
-        break;
     }
   }
 
@@ -707,13 +701,12 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     // no-flat-scratch-init.
     auto CheckForNoFlatScratchInit = [&](Instruction &I) {
       const auto &CB = cast<CallBase>(I);
-      const Value *CalleeOp = CB.getCalledOperand();
-      const Function *Callee = dyn_cast<Function>(CalleeOp);
+      const Function *Callee = CB.getCalledFunction();
       if (!Callee) // indirect call
         return CB.isInlineAsm();
 
       if (Callee->isIntrinsic())
-        return true;
+        return Callee->getIntrinsicID() != Intrinsic::amdgcn_addrspacecast_nonnull;
 
       const auto *CalleeInfo = A.getAAFor<AAAMDAttributes>(
           *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
@@ -722,7 +715,8 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
 
     bool UsedAssumedInformation = false;
     // If any callee is false (i.e. need FlatScratchInit),
-    // checkForAllCallLikeInstructions returns false
+    // checkForAllCallLikeInstructions returns false, in which case this
+    // function returns true.
     return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                               UsedAssumedInformation);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index 705bcbddf227a6..cb64c25b5f080e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -99,11 +105,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -114,11 +123,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -291,6 +303,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -306,6 +321,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -365,6 +383,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -382,6 +403,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -444,6 +468,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -461,6 +488,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -523,6 +553,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -535,6 +568,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -585,6 +621,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -599,6 +638,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -652,6 +694,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -666,6 +711,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -720,7 +768,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -728,6 +778,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -742,7 +793,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -750,6 +803,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -814,6 +868,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -831,6 +888,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -890,6 +950,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -905,6 +968,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -920,6 +986,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -934,6 +1002,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -970,6 +1042,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -987,6 +1062,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -1004,6 +1082,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1018,6 +1098,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1057,6 +1141,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -1074,6 +1161,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -1091,6 +1181,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1105,6 +1197,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1144,6 +1240,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1156,6 +1255,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1168,6 +1270,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1179,6 +1283,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1211,6 +1319,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -1225,6 +1336,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -1239,6 +1353,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1250,6 +1366,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1285,6 +1405,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -1299,6 +1422,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -1313,6 +1439,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1324,6 +1452,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1360,7 +1492,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1368,6 +1502,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -1382,7 +1517,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1390,6 +1527,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -1404,6 +1542,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1422,6 +1562,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 42
@@ -1478,6 +1622,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1495,6 +1642,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1512,6 +1662,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -1525,6 +1677,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1571,10 +1727,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1592,10 +1751,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1613,7 +1775,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1628,6 +1792,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1666,12 +1834,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1689,12 +1860,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1712,7 +1886,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1727,6 +1903,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1768,10 +1948,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1781,10 +1964,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1794,7 +1980,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -1806,6 +1994,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1840,12 +2032,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1855,12 +2050,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1870,7 +2068,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -1882,6 +2082,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1919,12 +2123,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1934,12 +2141,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1949,7 +2159,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -1961,6 +2173,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1999,6 +2215,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2025,6 +2244,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2051,12 +2273,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2070,6 +2294,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2128,6 +2356,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2146,6 +2377,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2164,12 +2398,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2178,6 +2414,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2231,8 +2471,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v2 offset:8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    flat_store_dword v[0:1], v3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2249,8 +2492,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v2 offset:8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2326,7 +2572,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -2342,7 +2591,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2410,7 +2662,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -2426,7 +2681,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2612,10 +2870,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2628,10 +2889,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2691,12 +2955,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2709,12 +2976,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2775,12 +3045,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2793,12 +3066,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2859,10 +3135,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2872,10 +3151,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2926,12 +3208,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2941,12 +3226,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2998,12 +3286,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3013,12 +3304,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3071,6 +3365,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3094,6 +3391,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3170,6 +3470,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3188,6 +3491,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3258,7 +3564,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v4, s3
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
 ; CI-NEXT:    flat_store_dword v[3:4], v0
@@ -3277,7 +3586,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    flat_store_dword v[3:4], v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index b3a7e65f771c43..00ff2d7a35d561 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -99,11 +105,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -114,11 +123,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -291,6 +303,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -306,6 +321,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -365,6 +383,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -382,6 +403,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -444,6 +468,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -461,6 +488,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -523,6 +553,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -535,6 +568,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -585,6 +621,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -599,6 +638,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -652,6 +694,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -666,6 +711,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -720,7 +768,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -728,6 +778,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -742,7 +793,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -750,6 +803,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -814,6 +868,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -831,6 +888,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -896,8 +956,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v2 offset:8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    flat_store_dword v[0:1], v3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -914,8 +977,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v2 offset:8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -991,7 +1057,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -1007,7 +1076,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1075,7 +1147,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -1091,7 +1166,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1277,10 +1355,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1293,10 +1374,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1356,12 +1440,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1374,12 +1461,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1440,12 +1530,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1458,12 +1551,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1524,10 +1620,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1537,10 +1636,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1591,12 +1693,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1606,12 +1711,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1663,12 +1771,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1678,12 +1789,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1736,6 +1850,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1759,6 +1876,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1835,6 +1955,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1853,6 +1976,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1916,6 +2042,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -1931,6 +2060,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -1946,6 +2078,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1960,6 +2094,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1996,6 +2134,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -2013,6 +2154,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -2030,6 +2174,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2044,6 +2190,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2083,6 +2233,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -2100,6 +2253,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -2117,6 +2273,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2131,6 +2289,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2170,6 +2332,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2182,6 +2347,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2194,6 +2362,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2205,6 +2375,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2237,6 +2411,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -2251,6 +2428,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -2265,6 +2445,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2276,6 +2458,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2311,6 +2497,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -2325,6 +2514,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -2339,6 +2531,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2350,6 +2544,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2386,7 +2584,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2394,6 +2594,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -2408,7 +2609,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2416,6 +2619,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -2430,6 +2634,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2448,6 +2654,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 42
@@ -2504,6 +2714,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2521,6 +2734,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2538,6 +2754,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -2551,6 +2769,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2604,7 +2826,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v4, s3
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
 ; CI-NEXT:    flat_store_dword v[3:4], v0
@@ -2623,7 +2848,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    flat_store_dword v[3:4], v0
@@ -2696,10 +2924,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2717,10 +2948,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2738,7 +2972,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2753,6 +2989,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2791,12 +3031,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2814,12 +3057,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2837,7 +3083,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2852,6 +3100,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2893,12 +3145,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2916,12 +3171,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2939,7 +3197,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2954,6 +3214,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2995,10 +3259,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3008,10 +3275,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3021,7 +3291,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3033,6 +3305,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3067,12 +3343,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3082,12 +3361,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3097,7 +3379,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3109,6 +3393,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3146,12 +3434,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3161,12 +3452,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3176,7 +3470,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3188,6 +3484,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3226,6 +3526,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3252,6 +3555,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3278,12 +3584,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3297,6 +3605,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3355,6 +3667,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3373,6 +3688,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3391,12 +3709,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3405,6 +3725,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3455,6 +3779,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
@@ -3462,6 +3787,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; CI-NEXT:    ds_inc_rtn_u32 v3, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -3475,6 +3802,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
@@ -3482,6 +3810,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; VI-NEXT:    ds_inc_rtn_u32 v3, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 34efb089b72bf1..868b530e42a219 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3037,7 +3037,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 10
+; GPRIDX-NEXT:     user_sgpr_count = 12
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3052,7 +3052,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -3069,7 +3069,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 13
+; GPRIDX-NEXT:     wavefront_sgpr_count = 15
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3117,7 +3117,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -3128,7 +3128,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 10
+; MOVREL-NEXT:     user_sgpr_count = 12
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3143,7 +3143,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -3160,7 +3160,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 9
+; MOVREL-NEXT:     wavefront_sgpr_count = 22
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -3178,21 +3178,24 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; MOVREL-NEXT:    s_load_dword s8, s[6:7], 0x8
+; MOVREL-NEXT:    s_add_i32 s10, s10, s15
+; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; MOVREL-NEXT:    s_mov_b32 s4, 0
 ; MOVREL-NEXT:    s_mov_b32 s5, 0x40080000
-; MOVREL-NEXT:    s_mov_b32 s2, 0
-; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
 ; MOVREL-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
+; MOVREL-NEXT:    s_mov_b32 s2, 0
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 4.0, s[4:5]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 4
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
+; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3220,7 +3223,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 10
+; GFX10-NEXT:     user_sgpr_count = 12
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3235,7 +3238,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4054,7 +4057,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 10
+; GPRIDX-NEXT:     user_sgpr_count = 12
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4069,7 +4072,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4086,7 +4089,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 12
+; GPRIDX-NEXT:     wavefront_sgpr_count = 14
 ; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4127,7 +4130,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 0
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4138,7 +4141,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 10
+; MOVREL-NEXT:     user_sgpr_count = 12
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4153,7 +4156,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4170,7 +4173,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 8
+; MOVREL-NEXT:     wavefront_sgpr_count = 22
 ; MOVREL-NEXT:     workitem_vgpr_count = 3
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -4188,6 +4191,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; MOVREL-NEXT:    s_add_i32 s10, s10, s15
+; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s2, 1
 ; MOVREL-NEXT:    s_cselect_b32 s3, 2.0, 1.0
@@ -4223,7 +4229,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 10
+; GFX10-NEXT:     user_sgpr_count = 12
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4238,7 +4244,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4401,7 +4407,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 10
+; GPRIDX-NEXT:     user_sgpr_count = 12
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4416,7 +4422,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4433,7 +4439,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 13
+; GPRIDX-NEXT:     wavefront_sgpr_count = 15
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4477,7 +4483,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4488,7 +4494,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 10
+; MOVREL-NEXT:     user_sgpr_count = 12
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4503,7 +4509,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4520,7 +4526,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 9
+; MOVREL-NEXT:     wavefront_sgpr_count = 22
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -4538,10 +4544,12 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dword s8, s[6:7], 0x8
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; MOVREL-NEXT:    s_add_i32 s10, s10, s15
+; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; MOVREL-NEXT:    s_mov_b32 s2, 0
-; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
@@ -4549,6 +4557,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
+; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -4576,7 +4585,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 10
+; GFX10-NEXT:     user_sgpr_count = 12
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4591,7 +4600,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 80e9ae33d6d45e..70b889389ff998 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -10,11 +10,11 @@
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
-; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_mov_b32 s4, s0
 ; GFX8V4-NEXT:    s_mov_b32 s5, s3
@@ -38,11 +38,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX8V5-LABEL: addrspacecast:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_mov_b32 s4, s0
 ; GFX8V5-NEXT:    s_mov_b32 s5, s2
@@ -65,9 +65,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
-; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
@@ -91,9 +91,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
-; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
@@ -127,6 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
@@ -140,6 +143,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xcc
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
@@ -183,6 +189,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
@@ -196,6 +205,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
@@ -279,7 +291,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
 ; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -305,6 +320,9 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 4fcde0f2fc7cf1..7aa3b5bb109903 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[20:23], s[6:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x10
-; GCN-NEXT:    s_add_u32 s0, s0, s13
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index c3938e673a6da6..6ed2df430998f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,13 +11,16 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
+; CHECK-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-NEXT:    ds_read_b32 v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v3, 9
+; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v3, 9
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x200
 ; CHECK-NEXT:    ds_write_b32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index d7a82b415ff06c..3a90c3ee908033 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -6,6 +6,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xa
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 0
 ; GCN-NEXT:    s_cselect_b32 s2, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index f3654fea486e0c..96fd14f52d13b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -42,6 +42,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -59,6 +62,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -76,6 +82,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -85,6 +93,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ;
 ; GFX10-LABEL: s_trig_preop_f64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x8
@@ -113,6 +125,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; CI-LABEL: s_trig_preop_f64_imm:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; CI-NEXT:    s_add_u32 s0, s0, 4
@@ -128,6 +143,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; VI-LABEL: s_trig_preop_f64_imm:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; VI-NEXT:    s_add_u32 s0, s0, 4
@@ -143,6 +161,8 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; GFX9-LABEL: s_trig_preop_f64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; GFX9-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
@@ -151,6 +171,10 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ;
 ; GFX10-LABEL: s_trig_preop_f64_imm:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 7d7f450e590faa..f0ec0d101f5bec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s8, s5, 31
 ; GFX8-NEXT:    s_add_i32 s0, s5, s8
@@ -145,6 +148,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
 ; GFX8-LABEL: sdivrem_i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
@@ -616,6 +622,9 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v2i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s10, 31
@@ -845,6 +854,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v4i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1271,6 +1283,9 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: sdivrem_v2i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2188,6 +2203,9 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
 ; GFX8-LABEL: sdiv_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x80008
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -2333,6 +2351,9 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_v2i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x80010
 ; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
@@ -2597,6 +2618,9 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
 ; GFX8-LABEL: sdiv_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x100010
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -2742,6 +2766,9 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-LABEL: sdivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s3
 ; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
@@ -3003,6 +3030,9 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; GFX8-LABEL: sdivrem_i3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x30008
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -3154,6 +3184,9 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s5, 0x1b0000
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 5aef6679347094..2be04ace99e36d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: udivrem_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s5
@@ -112,6 +115,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
 ; GFX8-LABEL: udivrem_i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s11
@@ -522,6 +528,9 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: udivrem_v2i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s10
@@ -685,6 +694,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: udivrem_v4i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -979,7 +991,10 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: udivrem_v2i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
 ; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x20
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
@@ -1772,6 +1787,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
 ; GFX8-LABEL: udiv_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x80008
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
@@ -1885,6 +1903,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s2, s0, 0x80010
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
@@ -2081,6 +2102,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
 ; GFX8-LABEL: udiv_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
@@ -2193,7 +2217,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-LABEL: udivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s2, s1, 0xffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
@@ -2387,6 +2414,9 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; GFX8-LABEL: udivrem_i3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x30008
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
@@ -2505,6 +2535,9 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: udivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0x7ffffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e53653408feb40..b8ffa4f14c3e5c 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -135,6 +135,9 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
 ; FIXEDABI-LABEL: marked_kernel_use_workitem_id:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v4, s1
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s0
@@ -181,16 +184,19 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
 ; FIXEDABI-LABEL: marked_kernel_use_workgroup_id:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s6
+; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, s1
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s7
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s9
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s10
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-NEXT:    s_endpgm
@@ -238,6 +244,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
+; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; FIXEDABI-NEXT:    s_add_u32 s0, s4, 8
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_addc_u32 s1, s5, 0
@@ -261,7 +270,10 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
 define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
 ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
 ; FIXEDABI:       ; %bb.0:
+; FIXEDABI-NEXT:    s_add_i32 s4, s4, s9
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v0, 0
+; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, 0
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 7336543b41cbc8..9534561e6e2805 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -63,23 +63,16 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
 
 ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
 
-; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
-
-; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
-; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
-; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
-
-; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
-; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base
-
-; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; GFX9: s_cmp_lg_u32 [[PTR]], -1
-; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
-; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
-
-; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
+; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
+; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
+; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
 
 ; HSA:  .amdhsa_user_sgpr_private_segment_buffer 1
 ; HSA:  .amdhsa_user_sgpr_dispatch_ptr 0
@@ -259,8 +252,11 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
 
 ; FIXME: Shouldn't need to enable queue ptr
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
-; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
+; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
+; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
 define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
@@ -281,7 +277,12 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
 
 ; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
 
-; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
+; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
+; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index 0a461f9ee6c968..e0c69706bad79c 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -8,8 +8,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-LABEL: readfirstlane_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s0, s0, s4
@@ -18,6 +20,7 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-NEXT:    s_add_u32 s0, s2, 40
 ; GCN-NEXT:    s_addc_u32 s1, s3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index de318e7ae31a5b..89c5303e0e81d1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -393,11 +393,15 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x83
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x80
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s0, 0
-; GCN-NEXT:    s_movk_i32 s0, 0x80
-; GCN-NEXT:    s_cselect_b32 s0, s0, 0x83
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
   %select = select i1 %cond, i16 5, i16 8
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index a6d8c6f41eee59..54a800ecee9f17 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
 
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE:  NumSgprs: 77
-; TRAP-HANDLER-DISABLE: NumSgprs: 92
+; TRAP-HANDLER-ENABLE:  NumSgprs: 83
+; TRAP-HANDLER-DISABLE: NumSgprs: 98
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
     ptr addrspace(1) %out0, i32 %in0,
     ptr addrspace(1) %out1, i32 %in1,
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 23294eacbe6cb3..ab2e28e5f5cbb5 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -751,7 +751,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @use_dispatch_ptr_ret_type()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index e30e013e3e3d8f..ac5458f56f08b8 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -478,11 +478,16 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
 
 ; No-op addrspacecast should not use queue ptr
 define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #1 {
-; HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
-; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
-; HSA-NEXT:    store volatile i32 0, ptr [[STOF]], align 4
-; HSA-NEXT:    ret void
+; AKF_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
+; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
+; AKF_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; AKF_HSA-NEXT:    store volatile i32 0, ptr [[STOF]], align 4
+; AKF_HSA-NEXT:    ret void
+;
+; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR13]] {
+; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[PTR]], align 4
+; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
@@ -490,11 +495,16 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 }
 
 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #1 {
-; HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
-; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
-; HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
-; HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4
-; HSA-NEXT:    ret void
+; AKF_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
+; AKF_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
+; AKF_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
+; AKF_HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4
+; AKF_HSA-NEXT:    ret void
+;
+; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR13]] {
+; ATTRIBUTOR_HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4
+; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   %ld = load volatile i32, ptr %stof
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index fc13b86566f76c..22cc5af30da662 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -35,9 +35,9 @@ entry:
 attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
 
 ; CHECK-LABEL: {{^}}min_1024_max_1024
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 2{{$}}
+; CHECK: NumSGPRsForWavesPerEU: 24{{$}}
 ; CHECK: NumVGPRsForWavesPerEU: 43
 @var = addrspace(1) global float 0.0
 define amdgpu_kernel void @min_1024_max_1024() #3 {
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index ed045107d354dc..3ddf8be052e4ab 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -4,8 +4,8 @@
 
 ; ALL-LABEL: {{^}}max_10_sgprs:
 
-; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 10
+; ALL: SGPRBlocks: 2
+; ALL: NumSGPRsForWavesPerEU: 22
 define amdgpu_kernel void @max_10_sgprs() #0 {
   %one = load volatile i32, ptr addrspace(4) undef
   %two = load volatile i32, ptr addrspace(4) undef
@@ -125,7 +125,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1
 declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
 declare ptr addrspace(4) @llvm.amdgcn.queue.ptr() #1
 
-attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
+attributes #0 = { nounwind "amdgpu-num-sgpr"="18" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
 attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 14519f5a5e77c0..26dc3e13a72cf0 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
 
 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 3
 ; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 20
+; CHECK: NumSGPRsForWavesPerEU: 26
 ; CHECK: NumVGPRsForWavesPerEU: 24
 define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, ptr addrspace(1) @var
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index ce5a3eedb5ebb1..f9f48aacfc2bf7 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -1,5 +1,5 @@
 ; Test the generation of the attribute amdgpu-no-flat-scratch-init
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator | FileCheck -check-prefixes=GFX10 %s
 
 ;; tests of alloca
 
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index c0d700cc37464b..6d9da9281211cb 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -1,6 +1,6 @@
 ; Test the generation of the attribute amdgpu-no-flat-scratch-init
-; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
 
 ;; tests of alloca
 
@@ -200,10 +200,10 @@ define amdgpu_kernel void @call_with_alloca_cc_kernel() {
 
 define void @call_both_with_and_without_alloca() {
 ; GFX9-LABEL: define void @call_both_with_and_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
   call void @with_alloca()
   call void @without_alloca()
   ret void
@@ -211,10 +211,10 @@ define void @call_both_with_and_without_alloca() {
 
 define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
   call void @with_alloca()
   call void @without_alloca()
   ret void
@@ -235,7 +235,7 @@ define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
 ; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
   call void @call_without_alloca()
   ret void
 }
@@ -262,10 +262,10 @@ define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
 
 define void @with_alloca_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 
 define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 
 define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -444,30 +444,30 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 
 define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -486,20 +486,20 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp
 
 define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI5:[0-9]+]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI6:[0-9]+]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
@@ -528,30 +528,30 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add
 
 define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
 
 define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp
 
 define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -632,70 +632,78 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec
 
 define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
+
+
+
+
+
+
+
+
 define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -704,10 +712,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
 
 define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -716,10 +724,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
 
 define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -728,10 +736,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
 
 define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -764,10 +772,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern
 
 define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -775,10 +783,10 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -834,10 +842,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI7:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI7:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) undef
   ret void
@@ -856,59 +864,84 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 
 define void @call_use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI7:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI7:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-;GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
+; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 ; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]]  = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 0d66ea55a04378..51fff3444324fa 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -33,8 +33,8 @@ define void @indirect_use_vcc() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
-; CI: ; NumSgprs: 36
-; VI-NOBUG: ; NumSgprs: 36
+; CI: ; NumSgprs: 38
+; VI-NOBUG: ; NumSgprs: 40
 ; VI-BUG: ; NumSgprs: 96
 ; GCN: ; NumVgprs: 41
 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
@@ -121,8 +121,8 @@ define void @indirect_use_80_sgpr() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
-; CI: ; NumSgprs: 82
-; VI-NOBUG: ; NumSgprs: 82
+; CI: ; NumSgprs: 84
+; VI-NOBUG: ; NumSgprs: 86
 ; VI-BUG: ; NumSgprs: 96
 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
   call void @indirect_use_80_sgpr()
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 3035a8579c8a6d..48ad2fe687804c 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -12,13 +12,13 @@
 ; OSABI-AMDHSA-ASM: .section .rodata,"a"
 ; OSABI-AMDHSA-ASM: .p2align 6
 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd
-; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
+; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 12
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 1
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
@@ -31,13 +31,13 @@
 ; OSABI-AMDHSA-ASM: .section .rodata,"a"
 ; OSABI-AMDHSA-ASM: .p2align 6
 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub
-; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
+; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 12
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 1
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index 5fbcd0bf669995..c97d3338006022 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,6 +5,9 @@
 define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
 ; CHECK-LABEL: _Z11test_kernelPii:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 3
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index 297fe7618672e6..f586f6d16e0efd 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -6,6 +6,8 @@
 define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) {
 ; CHECK-LABEL: eggs:
 ; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x8
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 85ed2914b8c7f5..39328d706ba260 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dword s14, s[6:7], 0x4
-; CHECK-NEXT:    s_add_u32 s24, s24, s13
+; CHECK-NEXT:    s_add_u32 s24, s24, s15
 ; CHECK-NEXT:    s_addc_u32 s25, s25, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_bitcmp1_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 54fb1dc5c05274..05a245cd3443c4 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -94,6 +94,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
 ; GFX7-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_add_i32 s0, s2, s2
 ; GFX7-NEXT:    s_cmp_lt_u32 s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 21799ab79b8396..7fde702bd49ac8 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -13,6 +13,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -25,6 +28,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -68,6 +74,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -80,6 +89,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -122,6 +134,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -134,6 +149,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -175,6 +193,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; CI-LABEL: s_fabs_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
@@ -188,6 +209,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; VI-LABEL: s_fabs_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
@@ -232,6 +256,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; CI-LABEL: fabs_fold_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -249,6 +276,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -295,6 +325,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -309,6 +342,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -358,6 +394,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -370,6 +409,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -415,6 +457,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -441,6 +486,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -499,9 +547,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v0, v[0:1]
 ; CI-NEXT:    s_lshr_b32 s2, s4, 16
@@ -527,9 +578,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -589,6 +643,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -612,6 +669,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -682,6 +742,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -700,6 +763,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 7252c69cb1cf75..4df6b8d0669158 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -74,6 +74,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -90,6 +93,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -220,8 +226,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -231,6 +239,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v2
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s0, s0, 3
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v3
@@ -247,6 +256,9 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -359,6 +371,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -369,6 +384,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index f0ce96af90649d..6482749bd2fb7e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -24,6 +24,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
 ; GFX678-LABEL: v_test_canonicalize_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -80,6 +83,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -91,6 +97,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -140,6 +149,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -196,6 +208,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -253,6 +268,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -309,6 +327,9 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: test_fold_canonicalize_undef_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -352,6 +373,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -395,6 +419,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -441,6 +468,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -485,6 +515,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, -1.0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -529,6 +562,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
 ; GFX678-LABEL: test_fold_canonicalize_literal_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -573,6 +609,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -616,10 +655,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -664,10 +706,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -712,10 +757,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -760,6 +808,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -804,6 +855,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -850,6 +904,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x807fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -894,6 +951,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
 ; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -938,6 +998,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -982,6 +1045,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1026,6 +1092,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1070,6 +1139,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1114,6 +1186,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1158,6 +1233,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1202,6 +1280,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
 ; GFX678-LABEL: v_test_canonicalize_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1257,6 +1338,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX6-LABEL: s_test_canonicalize_var_f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_max_f64 v[2:3], s[2:3], s[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -1267,6 +1351,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX8-LABEL: s_test_canonicalize_var_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -1313,6 +1400,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1369,6 +1459,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1426,6 +1519,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1482,10 +1578,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1531,10 +1630,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1578,10 +1680,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1623,10 +1728,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1668,10 +1776,13 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX678-LABEL: test_fold_canonicalize_literal_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1713,10 +1824,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1762,10 +1876,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, -1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1810,10 +1927,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1857,10 +1977,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, -1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1905,10 +2028,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1950,10 +2076,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1995,10 +2124,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2040,10 +2172,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2085,10 +2220,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2130,10 +2268,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2175,10 +2316,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2221,6 +2365,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2238,6 +2385,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2305,6 +2455,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2322,6 +2475,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2389,6 +2545,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2407,6 +2566,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2475,6 +2637,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2498,6 +2663,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2568,6 +2736,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2585,6 +2756,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2652,6 +2826,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2669,6 +2846,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2737,6 +2917,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2755,6 +2938,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2824,6 +3010,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2847,6 +3036,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2916,6 +3108,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -2933,6 +3128,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index fee6540f43c64c..ab00b132f2fd1a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -6,18 +6,13 @@
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga | FileCheck -check-prefix=NOHSA-NOADDR64 -check-prefix=ALL %s
 
 
-; There are no stack objects even though flat is used by default, so
-; flat_scratch_init should be disabled.
-
 ; ALL-LABEL: {{^}}test:
 
-; ALL-NOT: flat_scr
-
 ; HSA-DEFAULT: flat_store_dword
 ; HSA-NODEFAULT: buffer_store_dword
 ; HSA-NOADDR64: flat_store_dword
 
-; HSA: .amdhsa_user_sgpr_flat_scratch_init 0
+; HSA: .amdhsa_user_sgpr_flat_scratch_init 1
 
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 02eb1ad9453291..134b7902380862 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=FIJI-NOXNACK -check-prefix=GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
@@ -8,16 +8,16 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 
@@ -25,11 +25,15 @@
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 8
-; VI-NOXNACK: ; TotalNumSgprs: 8
-; VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
+; CI: ; NumSgprs: 8
+; VI-NOXNACK: ; NumSgprs: 8
+; VI-XNACK: ; NumSgprs: 12
+; HSA-VI-NOXNACK: ; NumSgprs: 8
+; HSA-VI-XNACK: ; NumSgprs: 12
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -42,11 +46,15 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 10
-; VI-NOXNACK: ; TotalNumSgprs: 10
-; VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
+; CI: ; NumSgprs: 10
+; VI-NOXNACK: ; NumSgprs: 10
+; VI-XNACK: ; NumSgprs: 12
+; HSA-VI-NOXNACK: ; NumSgprs: 10
+; HSA-VI-XNACK: ; NumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -59,11 +67,16 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 14
-; VI-XNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
+; CI: ; NumSgprs: 12
+; FIJI-NOXNACK: ; NumSgprs: 14
+; VI-NOXNACK: ; NumSgprs: 14
+; VI-XNACK: ; NumSgprs: 14
+; HSA-VI-NOXNACK: ; NumSgprs: 24
+; HSA-VI-XNACK: ; NumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -76,11 +89,13 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 14
-; VI-XNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
+; CI: ; NumSgprs: 12
+; VI-NOXNACK: ; NumSgprs: 14
+; VI-XNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -96,11 +111,13 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: TotalNumSgprs: 4
-; VI-NOXNACK: TotalNumSgprs: 6
-; VI-XNACK: TotalNumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 6
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -113,11 +130,13 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: TotalNumSgprs: 4
-; VI-NOXNACK: TotalNumSgprs: 6
-; VI-XNACK: TotalNumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 6
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -130,11 +149,13 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: TotalNumSgprs: 4
-; VI-NOXNACK: TotalNumSgprs: 6
-; VI-XNACK: TotalNumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 6
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index c60b9858abd836..23b453438f3a0f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -16,6 +16,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ; VI-LABEL: multiple_fadd_use_test_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_add_f32_e64 v0, s3, -1.0
 ; VI-NEXT:    v_add_f32_e64 v1, s2, -1.0
@@ -82,8 +85,11 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x2c
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_add_u32 s2, s0, 4
 ; VI-NEXT:    v_add_f32_e64 v2, s4, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -143,6 +149,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
 ; VI-LABEL: multiple_use_fadd_fmad_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_add_u32 s4, s0, 4
@@ -200,6 +209,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s6, s4, 4
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
@@ -263,6 +275,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e64 v0, s2, -4.0
 ; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
@@ -313,10 +328,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc0c00000
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -362,6 +380,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
+; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-DENORM-NEXT:    v_add_f16_e64 v0, s2, -1.0
@@ -380,6 +401,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
+; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-FLUSH-NEXT:    v_add_f16_e64 v0, s2, -1.0
@@ -498,6 +522,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
+; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s3
@@ -519,6 +546,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
+; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
@@ -619,6 +649,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
+; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s3
@@ -640,6 +673,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
+; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s3
@@ -742,6 +778,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; VI-DENORM-NEXT:    s_load_dword s6, s[6:7], 0x8
+; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
+; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s0, s0, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
@@ -749,6 +787,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-DENORM-NEXT:    v_fma_f16 v3, |s6|, 2.0, v0
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
+; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_add_u32 s4, s2, 2
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-DENORM-NEXT:    s_addc_u32 s5, s3, 0
@@ -765,6 +804,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; VI-FLUSH-NEXT:    s_load_dword s6, s[6:7], 0x8
+; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
+; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s0, s0, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
@@ -772,6 +813,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-FLUSH-NEXT:    v_mad_f16 v3, |s6|, 2.0, v0
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
+; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_add_u32 s4, s2, 2
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-FLUSH-NEXT:    s_addc_u32 s5, s3, 0
@@ -875,6 +917,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, s2, -4.0
 ; VI-NEXT:    v_mul_f16_e32 v2, s2, v0
@@ -928,10 +973,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc600
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
 ; VI-NEXT:    v_mul_f16_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 8267bb9f5450f8..c4f13749251a4c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -8,6 +8,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; CI-LABEL: fneg_fabs_fadd_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -25,6 +28,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -71,6 +77,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; CI-LABEL: fneg_fabs_fmul_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s1, s0, 0x7fff
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -89,6 +98,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -139,6 +151,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitset1_b32 s2, 15
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -151,6 +166,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitset1_b32 s2, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -195,6 +213,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitset1_b32 s2, 15
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -207,6 +228,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitset1_b32 s2, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -249,6 +273,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CIVI-LABEL: v_fneg_fabs_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -294,6 +321,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s1
@@ -316,7 +346,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -325,6 +357,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -369,6 +402,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_or_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -381,6 +417,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_or_b32 s2, s2, 0x80008000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -423,6 +462,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ; CIVI-LABEL: fneg_fabs_v4f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_or_b32 s3, s3, 0x80008000
 ; CIVI-NEXT:    s_or_b32 s2, s2, 0x80008000
@@ -468,6 +510,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; CI-LABEL: fold_user_fneg_fabs_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s1|
@@ -489,7 +534,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc400
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -497,6 +544,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; VI-NEXT:    v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -538,6 +586,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
@@ -555,6 +606,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
@@ -611,6 +665,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_bfe_u32 s0, s4, 0xf0010
@@ -635,7 +692,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xc400
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_lshr_b32 s1, s4, 16
@@ -644,6 +703,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; VI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
 ; VI-NEXT:    v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e64 v5, |s4|, -4.0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; VI-NEXT:    v_mov_b32_e32 v5, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 98b17bbaa0a959..7137834b6552e0 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1478,6 +1478,8 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x6
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bitcmp1_b32 s8, 0
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1489,6 +1491,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 40982347f3ca00..6e22c9c319f698 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -10,6 +10,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -22,6 +25,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -66,6 +72,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -80,6 +89,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -129,6 +141,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -141,6 +156,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -183,6 +201,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CI-LABEL: v_fneg_fold_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -200,6 +221,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8-LABEL: v_fneg_fold_f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -246,6 +270,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -258,6 +285,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -299,14 +329,17 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
 ; CIVI-LABEL: s_fneg_v2f16_nonload:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
 ; CIVI-NEXT:    ;;#ASMSTART
 ; CIVI-NEXT:    ; def s2
 ; CIVI-NEXT:    ;;#ASMEND
 ; CIVI-NEXT:    s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT:    v_mov_b32_e32 v2, s2
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
@@ -349,6 +382,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -363,6 +399,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -412,6 +451,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -424,6 +466,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -466,6 +511,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; CI-LABEL: v_fneg_fold_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -492,6 +540,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX8-LABEL: v_fneg_fold_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -539,6 +590,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; CI-LABEL: v_extract_fneg_fold_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -560,6 +614,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; GFX8-LABEL: v_extract_fneg_fold_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -624,6 +681,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 3735c6349fbb31..6207c442f41eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -10,6 +10,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -21,6 +24,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -48,6 +54,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -59,6 +68,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -85,6 +97,9 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ; CIVI-LABEL: load_v3f16_arg:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_add_u32 s4, s0, 4
 ; CIVI-NEXT:    s_addc_u32 s5, s1, 0
@@ -120,6 +135,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; CIVI-LABEL: load_v4f16_arg:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v2, s2
@@ -147,6 +165,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v4, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -161,6 +182,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -193,6 +217,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s3, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -206,6 +233,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -239,6 +269,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -250,6 +283,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -279,6 +315,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s3, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -292,6 +331,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -324,6 +366,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; CI-LABEL: extload_v3f16_to_v3f32_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
@@ -337,6 +382,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; VI-LABEL: extload_v3f16_to_v3f32_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
@@ -369,6 +417,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; CI-LABEL: extload_v4f16_to_v4f32_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s3, 16
 ; CI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -384,6 +435,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; VI-LABEL: extload_v4f16_to_v4f32_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -421,6 +475,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s6, s1, 16
 ; CI-NEXT:    s_lshr_b32 s7, s0, 16
@@ -449,6 +506,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s6, s1, 16
 ; VI-NEXT:    s_lshr_b32 s7, s0, 16
@@ -507,6 +567,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; CI-LABEL: extload_f16_to_f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -520,6 +583,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; VI-LABEL: extload_f16_to_f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -553,6 +619,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; CI-LABEL: extload_v2f16_to_v2f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s1
@@ -569,6 +638,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; VI-LABEL: extload_v2f16_to_v2f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s1
@@ -608,6 +680,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; CI-LABEL: extload_v3f16_to_v3f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
@@ -629,6 +704,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; VI-LABEL: extload_v3f16_to_v3f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
@@ -676,6 +754,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; CI-LABEL: extload_v4f16_to_v4f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
@@ -701,6 +782,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; VI-LABEL: extload_v4f16_to_v4f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s3, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s3
@@ -756,6 +840,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s6, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s6
@@ -803,6 +890,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s6, s0, 16
 ; VI-NEXT:    s_lshr_b32 s8, s2, 16
@@ -890,6 +980,9 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
 ; CIVI-LABEL: global_load_store_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -920,6 +1013,9 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: global_load_store_v2f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -950,6 +1046,9 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
 ; CIVI-LABEL: global_load_store_v4f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
@@ -980,6 +1079,9 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: global_load_store_v8f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1010,6 +1112,9 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
 ; CIVI-LABEL: global_extload_f16_to_f32:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1043,6 +1148,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v2f16_to_v2f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1059,6 +1167,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v2f16_to_v2f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1096,6 +1207,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v3f16_to_v3f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1113,6 +1227,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v3f16_to_v3f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1152,6 +1269,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v4f16_to_v4f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1171,6 +1291,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v4f16_to_v4f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1213,6 +1336,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v8f16_to_v8f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1243,6 +1369,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v8f16_to_v8f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1301,6 +1430,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; CI-LABEL: global_extload_v16f16_to_v16f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 16
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
@@ -1359,6 +1491,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; VI-LABEL: global_extload_v16f16_to_v16f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1458,6 +1593,9 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
 ; CIVI-LABEL: global_extload_f16_to_f64:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1494,6 +1632,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v2f16_to_v2f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1512,6 +1653,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v2f16_to_v2f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1554,6 +1698,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v3f16_to_v3f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1579,6 +1726,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v3f16_to_v3f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1632,6 +1782,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v4f16_to_v4f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1660,6 +1813,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v4f16_to_v4f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1719,6 +1875,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v8f16_to_v8f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1767,6 +1926,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v8f16_to_v8f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1853,6 +2015,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; CI-LABEL: global_extload_v16f16_to_v16f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1948,6 +2113,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; VI-LABEL: global_extload_v16f16_to_v16f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2103,6 +2271,9 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
 ; CIVI-LABEL: global_truncstore_f32_to_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2136,6 +2307,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v2f32_to_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2153,6 +2327,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v2f32_to_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2191,6 +2368,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v3f32_to_v3f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2214,6 +2394,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v3f32_to_v3f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2261,6 +2444,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v4f32_to_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2282,6 +2468,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v4f32_to_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2326,6 +2515,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v8f32_to_v8f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2361,6 +2553,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v8f32_to_v8f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2426,6 +2621,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; CI-LABEL: global_truncstore_v16f32_to_v16f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 32
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
@@ -2495,6 +2693,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; VI-LABEL: global_truncstore_v16f32_to_v16f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s4, s2, 32
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
@@ -2607,6 +2808,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; CI-LABEL: fadd_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -2624,6 +2828,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -2656,6 +2863,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; CI-LABEL: fadd_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
@@ -2677,6 +2887,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; VI-LABEL: fadd_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -2710,6 +2923,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-LABEL: fadd_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2747,6 +2963,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-LABEL: fadd_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2787,6 +3006,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 {
 ; CI-LABEL: fadd_v8f16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2845,6 +3067,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
 ;
 ; VI-LABEL: fadd_v8f16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2909,6 +3134,9 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
 ; CIVI-LABEL: test_bitcast_from_half:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2940,6 +3168,9 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
 ; CIVI-LABEL: test_bitcast_to_half:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 8c017fa5ec2636..51b0e2b86cdf39 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,10 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; CHECK:     .sgpr_count:     10
+; GFX700:     .sgpr_count:     22
+; GFX803:     .sgpr_count:     24
+; GFX900:     .sgpr_count:     10
+; GFX1010:    .sgpr_count:     10
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
@@ -48,8 +51,8 @@ entry:
 
 ; CHECK:   .name:       num_spilled_sgprs
 ; GFX700:   .sgpr_spill_count: 10
-; GFX803:   .sgpr_spill_count: 10
-; GFX900:   .sgpr_spill_count: 62
+; GFX803:   .sgpr_spill_count: 0
+; GFX900:   .sgpr_spill_count: 0
 ; GFX1010:  .sgpr_spill_count: 60
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
 define amdgpu_kernel void @num_spilled_sgprs(
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 37476203fbfad6..2c38e201d326f6 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -43,7 +43,7 @@
 ; ELF:   00E0: 6E616D65 A673696D 706C65BB 2E707269
 ; ELF:   00F0: 76617465 5F736567 6D656E74 5F666978
 ; ELF:   0100: 65645F73 697A6500 AB2E7367 70725F63
-; ELF:   0110: 6F756E74 06B12E73 6770725F 7370696C
+; ELF:   0110: 6F756E74 0EB12E73 6770725F 7370696C
 ; ELF:   0120: 6C5F636F 756E7400 A72E7379 6D626F6C
 ; ELF:   0130: A973696D 706C652E 6B64AB2E 76677072
 ; ELF:   0140: 5F636F75 6E7403B1 2E766770 725F7370
@@ -59,7 +59,7 @@
 ; ELF:   01E0: 73696D70 6C655F6E 6F5F6B65 726E6172
 ; ELF:   01F0: 6773BB2E 70726976 6174655F 7365676D
 ; ELF:   0200: 656E745F 66697865 645F7369 7A6500AB
-; ELF:   0210: 2E736770 725F636F 756E7400 B12E7367
+; ELF:   0210: 2E736770 725F636F 756E740C B12E7367
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index c998a4b19121e1..8f9b223c361d60 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -10,11 +10,11 @@
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
-; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V4-NEXT:    s_cselect_b32 s3, s3, 0
@@ -36,11 +36,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX8V5-LABEL: addrspacecast:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V5-NEXT:    s_cselect_b32 s2, s2, 0
@@ -62,9 +62,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
-; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
@@ -88,9 +88,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
-; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
@@ -124,6 +124,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -136,6 +139,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xcc
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -178,6 +184,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -190,6 +199,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xc8
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -273,7 +285,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
 ; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -298,6 +313,9 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
index f419d89a7f0a44..b283a8fca8a394 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
@@ -68,6 +68,6 @@ if.end:
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 292722c2607add..ff5c71b65f34bd 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %11
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %12
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %9
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %9
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %10
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() {
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %11
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %11
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %12
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %9
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %10
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
   call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -46,16 +46,16 @@ define amdgpu_kernel void @v_input_output_i128() {
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %11
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %11
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %12
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %9
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %10
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
   call void asm sideeffect "; use $0", "a"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index c68138acc9b2bf..cf6ae3d5b4f684 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; VI-LABEL: s_insertelement_v2bf16_0:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -82,6 +85,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; VI-LABEL: s_insertelement_v2bf16_1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -144,6 +150,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -216,6 +225,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -286,6 +298,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -358,6 +373,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -435,11 +453,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -530,14 +551,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
@@ -609,14 +633,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
@@ -686,14 +713,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -765,14 +795,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
@@ -848,9 +881,12 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -942,9 +978,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
@@ -1058,9 +1097,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -1237,11 +1279,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -1409,11 +1454,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 647870f0e08979..aafa4a04a00de9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -20,6 +20,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2i16_0:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -69,6 +72,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -85,6 +91,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -140,6 +149,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -160,6 +172,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -222,6 +237,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -237,6 +255,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -293,6 +314,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -312,6 +336,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -382,6 +409,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -404,6 +434,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -475,6 +508,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2i16_1:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -523,6 +559,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -539,6 +578,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -589,6 +631,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2f16_0:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -636,6 +681,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2f16_1:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -684,6 +732,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -702,6 +753,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -760,9 +814,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -779,9 +836,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -838,6 +898,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -856,6 +919,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -913,6 +979,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -931,6 +1000,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -987,6 +1059,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1005,6 +1080,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1061,6 +1139,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1079,6 +1160,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1135,6 +1219,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1153,6 +1240,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1209,6 +1299,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1227,6 +1320,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1283,6 +1379,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1301,6 +1400,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1363,6 +1465,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1382,6 +1487,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1446,9 +1554,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -1467,9 +1578,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -1535,11 +1649,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1562,11 +1679,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1637,14 +1757,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
@@ -1656,9 +1779,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1718,14 +1844,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
@@ -1737,9 +1866,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1800,14 +1932,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -1819,9 +1954,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1881,14 +2019,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
@@ -1900,9 +2041,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1963,14 +2107,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -1982,9 +2129,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2050,6 +2200,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    flat_load_dword v4, v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -2077,6 +2230,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    flat_load_dword v4, v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -2160,9 +2316,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2186,9 +2345,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2258,9 +2420,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
@@ -2278,9 +2443,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2341,9 +2509,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2361,9 +2532,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2455,9 +2629,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2509,9 +2686,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2656,11 +2836,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2683,9 +2866,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s3
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
@@ -2761,12 +2947,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2774,6 +2962,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
+; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v3, s4, v3, v12
@@ -2787,11 +2976,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2928,11 +3120,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -3025,11 +3220,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[7:10], v[2:3]
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index a2f55d6aa83968..758e8b6e1e2ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
 ; GCN-NOT: writelane
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; NumSgprs: 33
+; GCN: ; NumSgprs: 37
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
   %vgpr = load volatile i32, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 496a1c652da251..1a32953305bbc3 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -7,7 +7,7 @@ declare void @llvm.trap() #0
 ; DOORBELL-NEXT:     .amdhsa_group_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_private_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_kernarg_size 8
-; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 12
+; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 14
 ; DOORBELL-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; DOORBELL:      .end_amdhsa_kernel
 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index aebc8315514fbc..d51ace630f6925 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -113,16 +113,24 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -168,16 +176,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -223,16 +239,24 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -278,16 +302,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -320,7 +352,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -334,8 +371,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 0
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -348,15 +385,24 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    s_mov_b32 s15, 4
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -383,7 +429,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -397,8 +448,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 2
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -411,15 +462,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    s_mov_b32 s15, 6
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -446,7 +506,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -460,8 +525,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 1
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -474,15 +539,24 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    s_mov_b32 s15, 5
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -509,7 +583,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -523,8 +602,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 3
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -537,15 +616,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    s_mov_b32 s15, 7
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 8fadfe3d026669..c998a007277933 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -23,8 +23,11 @@ define void @function_lds_id(ptr addrspace(1) %out) {
 define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: kernel_lds_id:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s2, s10, 42
+; GCN-NEXT:    s_add_i32 s2, s12, 42
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -42,19 +45,28 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 ; GCN-LABEL: indirect_lds_id:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:    s_add_u32 s8, s8, function_lds_id at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s9, s9, function_lds_id at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GCN-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GCN-NEXT:    s_add_u32 s8, s6, 8
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT:    s_addc_u32 s9, s7, 0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, function_lds_id at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, function_lds_id at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GCN-NEXT:    s_mov_b32 s15, 21
-; GCN-NEXT:    s_mov_b32 s12, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_endpgm
   call void @function_lds_id(ptr addrspace(1) %out)
   ret void
@@ -63,6 +75,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: doesnt_use_it:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x64
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 39a3b1c8adc9f1..44ea414dd4b93c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -259,6 +259,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -269,6 +272,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -284,10 +290,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -296,10 +305,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -312,10 +324,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -324,11 +339,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -341,12 +359,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
 ; CHECK-SDAG-LABEL: test_readfirstlane_m0:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -354,12 +375,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
 ; CHECK-GISEL-LABEL: test_readfirstlane_m0:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -373,25 +397,31 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -405,13 +435,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -419,13 +452,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -439,13 +475,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -453,13 +492,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 24a332fa211c15..ffceac2c912bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -179,6 +179,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -189,6 +192,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -204,10 +210,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -216,10 +225,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -232,10 +244,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -244,11 +259,14 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -262,6 +280,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -281,6 +302,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -311,6 +335,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -332,6 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -365,6 +395,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -386,6 +419,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -419,12 +455,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
 ; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -432,12 +471,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
 ; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -454,11 +496,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
@@ -468,10 +513,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; def v0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -485,14 +533,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v[0:1]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,10 +556,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ; def v[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -523,14 +577,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v[0:1]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -543,10 +600,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ; def v[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -561,25 +621,31 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -593,13 +659,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -607,13 +676,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -627,13 +699,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -641,13 +716,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 9d93ca65683c42..da40a06c306b98 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -15,6 +15,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -55,6 +58,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -102,6 +108,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
@@ -153,6 +162,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[6:7], 0x10
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
@@ -210,6 +222,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
@@ -261,6 +276,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[6:7], 0x10
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
@@ -318,6 +336,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -362,6 +383,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -412,6 +436,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -462,6 +489,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -518,11 +548,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
@@ -573,11 +606,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -633,6 +669,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -694,6 +733,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -766,6 +808,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -833,6 +878,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -909,7 +957,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -918,6 +968,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX802-SDAG-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
@@ -980,7 +1031,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -990,6 +1043,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1064,15 +1118,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-SDAG-NEXT:    ;;#ASMSTART
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX802-SDAG-NEXT:    ;;#ASMEND
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    s_mov_b32 s4, m0
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
@@ -1119,15 +1176,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-GISEL-NEXT:    ;;#ASMSTART
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX802-GISEL-NEXT:    ;;#ASMEND
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    s_mov_b32 s4, m0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
@@ -1178,6 +1238,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -1222,6 +1285,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1271,6 +1337,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-SDAG-LABEL: test_writelane_imm_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -1316,6 +1385,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-GISEL-LABEL: test_writelane_imm_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1367,6 +1439,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG-LABEL: test_writelane_imm_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -1412,6 +1487,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL-LABEL: test_writelane_imm_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1464,6 +1542,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
@@ -1503,6 +1584,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
@@ -1548,10 +1632,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
@@ -1596,11 +1683,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1649,10 +1739,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
@@ -1697,11 +1790,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1748,7 +1844,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, 42
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -1782,7 +1881,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, 42
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1822,11 +1924,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
@@ -1867,11 +1972,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1917,11 +2025,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
@@ -1962,11 +2073,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 7202ab8b314669..59cc6dfac12005 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-HSA-LABEL: constant_load_f64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -93,6 +96,9 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
 ; GFX7-HSA-LABEL: constant_load_2v4f64:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[20:21], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 2ee1c60b4bbf2d..5d69aa7d679bed 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -26,6 +26,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: constant_load_i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -108,6 +111,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v2i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -181,6 +187,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v3i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 4
@@ -287,6 +296,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v4i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -363,6 +375,9 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v8i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -449,6 +464,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GCN-HSA-LABEL: constant_load_v16i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -591,6 +609,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GCN-HSA-LABEL: constant_load_v16i16_align2:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
@@ -804,6 +825,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_zextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -881,6 +905,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_sextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -959,6 +986,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1036,6 +1066,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1112,6 +1145,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1199,6 +1235,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1290,6 +1329,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1389,6 +1431,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1493,6 +1538,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1601,6 +1649,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1720,6 +1771,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1880,6 +1934,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2059,6 +2116,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2323,6 +2383,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2631,6 +2694,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3112,6 +3178,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3683,6 +3752,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4606,6 +4678,9 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5394,6 +5469,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_zextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5482,6 +5560,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_sextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5569,6 +5650,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5652,6 +5736,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5738,12 +5825,15 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s0, s2, 16
 ; GCN-HSA-NEXT:    s_and_b32 s1, s2, 0xffff
@@ -5834,6 +5924,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -5939,10 +6032,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s4, s3, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s5, s2, 16
@@ -6076,6 +6172,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6234,10 +6333,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s8, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
@@ -6454,6 +6556,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6717,10 +6822,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s7, 16
@@ -7104,6 +7212,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7582,10 +7693,13 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s19, s1, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s20, s3, 16
@@ -8310,6 +8424,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 4ab55164e09992..6283f6bb3c5e38 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -23,6 +23,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-HSA-LABEL: constant_load_i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -105,6 +108,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v2i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -194,6 +200,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -290,6 +299,9 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v4i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -390,6 +402,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-HSA-LABEL: constant_load_v8i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -526,6 +541,9 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-HSA-LABEL: constant_load_v9i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s12, s[10:11], 0x8
@@ -689,6 +707,9 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX7-HSA-LABEL: constant_load_v10i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[12:13], s[10:11], 0x8
@@ -861,6 +882,9 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v11i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[2:3], 0x8
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
@@ -1038,6 +1062,9 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX7-HSA-LABEL: constant_load_v12i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x8
@@ -1221,6 +1248,9 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v16i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s18, s16, 48
@@ -1409,6 +1439,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1495,6 +1528,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1587,6 +1623,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1673,6 +1712,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1767,12 +1809,15 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s3
@@ -1867,6 +1912,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1983,13 +2031,16 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GFX7-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s6
@@ -2118,6 +2169,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2282,8 +2336,10 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 48
@@ -2291,6 +2347,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s11
@@ -2491,6 +2548,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -2789,6 +2849,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3245,13 +3308,16 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s18, s16, 0x70
 ; GFX7-HSA-NEXT:    s_addc_u32 s19, s17, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s18
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s19
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s14
@@ -3681,6 +3747,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4539,14 +4608,17 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s34, s36, 0xf0
 ; GFX7-HSA-NEXT:    s_addc_u32 s35, s37, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s34
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s35
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s30
@@ -5160,6 +5232,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 46c7c2b08cd64b..45f0af8d423b67 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-LABEL: constant_load_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -97,6 +100,9 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v2i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s0
@@ -183,6 +189,9 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v3i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
@@ -299,6 +308,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-LABEL: constant_load_v4i64:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -430,6 +442,9 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v8i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-NEXT:    s_add_u32 s18, s16, 48
@@ -649,6 +664,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ; GFX7-LABEL: constant_load_v16i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index ce17c81a24dd50..43b79973187a6b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
 ; GFX7-HSA-LABEL: constant_load_i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -114,6 +117,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v2i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -199,6 +205,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v3i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -311,6 +320,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v4i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -382,6 +394,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v8i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -458,6 +473,9 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v16i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -541,6 +559,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -618,6 +639,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -696,6 +720,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -773,6 +800,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -854,6 +884,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -955,6 +988,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1054,6 +1090,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1157,6 +1196,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1260,6 +1302,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1366,6 +1411,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1485,6 +1533,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1646,6 +1697,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1830,6 +1884,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2098,6 +2155,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2414,6 +2474,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2898,6 +2961,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3483,6 +3549,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4402,6 +4471,9 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5211,6 +5283,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5295,6 +5370,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5382,6 +5460,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5464,6 +5545,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5554,6 +5638,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5663,6 +5750,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5778,10 +5868,13 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_bfe_u32 s4, s2, 0x80008
 ; GFX7-HSA-NEXT:    s_lshr_b32 s3, s2, 24
@@ -5918,6 +6011,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6079,10 +6175,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s4, s2, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s5, s3, 24
@@ -6303,6 +6402,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6574,10 +6676,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s8, s5, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s9, s4, 24
@@ -6968,6 +7073,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7459,10 +7567,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s12, s4, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s13, s5, 24
@@ -8205,6 +8316,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -8979,6 +9093,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9065,6 +9182,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9153,6 +9273,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9239,6 +9362,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9330,6 +9456,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9431,6 +9560,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9545,6 +9677,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -9681,6 +9816,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -9839,6 +9977,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -10043,6 +10184,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -10292,6 +10436,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10650,6 +10797,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11099,6 +11249,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16:
 ; GFX7-HSA:       ; %bb.0:
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -11763,6 +11916,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16:
 ; GFX7-HSA:       ; %bb.0:
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index e0c2d00891250b..6a973d0adeffa5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -28,6 +28,9 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
 ; GCN-HSA-LABEL: global_load_i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -133,6 +136,9 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v2i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -219,6 +225,9 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v3i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -350,6 +359,9 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v4i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -435,6 +447,9 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v8i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -523,6 +538,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
 ; GCN-HSA-LABEL: global_load_v16i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
@@ -673,6 +691,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; GCN-HSA-LABEL: global_load_v16i16_align2:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
@@ -822,6 +843,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_zextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -907,6 +931,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_sextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -995,6 +1022,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1080,6 +1110,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1170,6 +1203,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1269,6 +1305,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1370,6 +1409,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1480,6 +1522,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1597,6 +1642,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1712,6 +1760,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1834,6 +1885,9 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1983,6 +2037,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2147,6 +2204,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2383,6 +2443,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2654,6 +2717,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -3065,6 +3131,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3584,6 +3653,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -4388,6 +4460,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5153,6 +5228,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_zextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5250,6 +5328,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_sextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5345,6 +5426,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5437,6 +5521,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5535,6 +5622,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5644,6 +5734,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5762,6 +5855,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5907,6 +6003,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6067,10 +6166,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6085,8 +6184,11 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v4
@@ -6286,6 +6388,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6536,10 +6641,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6556,7 +6661,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v8
@@ -6916,6 +7024,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -7387,6 +7498,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -8089,6 +8203,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 4d7f1a9663c3dc..8322eee8264957 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
 ; GCNX3-HSA-LABEL: global_load_i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -106,6 +109,9 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v2i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -186,6 +192,9 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v3i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -270,6 +279,9 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v4i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -352,6 +364,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v8i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
@@ -458,6 +473,9 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v9i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -589,6 +607,9 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v10i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -719,6 +740,9 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v11i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -854,6 +878,9 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v12i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -987,6 +1014,9 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v16i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -1134,6 +1164,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
 ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1217,6 +1250,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
 ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1301,6 +1337,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1384,6 +1423,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1471,6 +1513,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1568,6 +1613,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1670,8 +1718,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, 0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1679,6 +1729,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v2
@@ -1796,6 +1847,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1937,8 +1991,10 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, 0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1953,6 +2009,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
@@ -2130,6 +2187,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2366,6 +2426,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2727,8 +2790,10 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, 0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2762,6 +2827,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s1
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
@@ -3118,6 +3184,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3589,7 +3658,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-GFX900-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s13
+; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-GFX900-HSA-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-GFX900-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
@@ -3909,6 +3978,9 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -4433,6 +4505,9 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v32i32:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 4dfc773d615e4f..1a6fa3c518ca7a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -13,7 +13,8 @@
 ; GCN: s_cselect_b32
 
 ; GCN-NOT: load_dword
-; GCN: flat_load_dwordx2
+; GCN: flat_load_dword
+; GCN: flat_load_dword
 ; GCN-NOT: load_dword
 
 ; GCN: flat_store_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 05ad567478675c..12df2b0ed9380d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -161,7 +161,10 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -182,6 +185,7 @@ define amdgpu_kernel void @k01() {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_endpgm
+
   call void @f0()
   call void @f1()
   ret void
@@ -197,7 +201,10 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -244,7 +251,10 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 3453ff9d296c09..72a0aceaae12b6 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -226,7 +226,10 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -270,7 +273,10 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -317,7 +323,10 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index e876a8d9dda692..245a2775d9f2fb 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
 
 ; GCN-LABEL: {{^}}get_global_id_0:
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
-; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]]
+; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0
 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 {
   %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 92536c2078514a..e8632871f56ea4 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX9-NEXT:    s_mul_i32 s10, s10, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s10
+; GFX9-NEXT:    s_mul_i32 s12, s12, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s12
 ; GFX9-NEXT:    v_add_u32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
@@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX10-NEXT:    s_mul_i32 s10, s10, s4
-; GFX10-NEXT:    v_add3_u32 v0, s5, s10, v0
+; GFX10-NEXT:    s_mul_i32 s12, s12, s4
+; GFX10-NEXT:    v_add3_u32 v0, s5, s12, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0a76e169e9c385..0348737a41a30f 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
 ; CHECK-LABEL: memcpy_p0_p0_minsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
@@ -186,7 +188,7 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s13
+; CHECK-NEXT:    s_add_u32 s16, s16, s15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
 ; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
@@ -566,7 +568,9 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
-; CHECK-NEXT:    s_add_u32 s16, s16, s13
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; CHECK-NEXT:    s_add_u32 s16, s16, s15
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
@@ -981,6 +985,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
 ; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
 ; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
@@ -1255,6 +1261,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
 ; CHECK-LABEL: memcpy_p0_p0_optsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
@@ -1432,7 +1440,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s13
+; CHECK-NEXT:    s_add_u32 s16, s16, s15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
 ; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
@@ -1812,7 +1820,9 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
-; CHECK-NEXT:    s_add_u32 s16, s16, s13
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; CHECK-NEXT:    s_add_u32 s16, s16, s15
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
@@ -2227,6 +2237,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
 ; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
 ; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ef2698c1f774..86cdf3ccd04415 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -34,10 +34,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -56,10 +59,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -146,6 +152,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_imin_sle_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -157,6 +166,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_imin_sle_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -218,6 +230,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; CI-LABEL: s_test_imin_sle_v1i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -229,6 +244,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; VI-LABEL: s_test_imin_sle_v1i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -292,6 +310,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; CI-LABEL: s_test_imin_sle_v4i32:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -310,6 +331,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; VI-LABEL: s_test_imin_sle_v4i32:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -422,11 +446,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i8 s2, s2
 ; CI-NEXT:    s_sext_i32_i8 s3, s3
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_byte v[0:1], v2
@@ -437,11 +464,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i8 s2, s2
 ; VI-NEXT:    s_sext_i32_i8 s3, s3
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -559,6 +589,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s4, s2, 24
 ; CI-NEXT:    s_sext_i32_i8 s5, s2
@@ -582,6 +614,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; CI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; CI-NEXT:    s_or_b32 s2, s3, s2
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -591,7 +624,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s4, s2, 24
 ; VI-NEXT:    s_bfe_i32 s5, s2, 0x80010
@@ -615,6 +649,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_or_b32 s2, s2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -769,6 +804,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; CI-LABEL: s_test_imin_sle_v2i16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s4, s2, 16
 ; CI-NEXT:    s_sext_i32_i16 s2, s2
@@ -788,6 +826,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; VI-LABEL: s_test_imin_sle_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s4, s2, 16
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
@@ -922,6 +963,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s6, s0, 16
 ; CI-NEXT:    s_ashr_i32 s7, s1, 16
@@ -952,6 +996,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s6, s1, 16
 ; VI-NEXT:    s_sext_i32_i16 s1, s1
@@ -1050,10 +1097,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1072,10 +1122,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1191,10 +1244,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1213,10 +1269,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1304,6 +1363,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_imin_slt_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1315,6 +1377,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_imin_slt_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1378,6 +1443,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s1, s1, s3
 ; CI-NEXT:    s_min_i32 s0, s0, s2
@@ -1392,6 +1460,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s1, s1, s3
 ; VI-NEXT:    s_min_i32 s0, s0, s2
@@ -1466,6 +1537,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1478,6 +1552,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1545,6 +1622,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1557,6 +1637,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1636,10 +1719,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1658,10 +1744,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1767,12 +1856,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v6
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, s5
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, s4, v6
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; CI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; CI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
@@ -1791,12 +1883,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v6
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; VI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
@@ -1930,12 +2025,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
@@ -1966,12 +2064,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
@@ -2070,6 +2171,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_umin_ule_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2081,6 +2185,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_umin_ule_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2155,10 +2262,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2177,10 +2287,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2286,6 +2399,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s3
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
@@ -2307,6 +2423,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v0
@@ -2394,6 +2513,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_umin_ult_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2405,6 +2527,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_umin_ult_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2488,6 +2613,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; CI-LABEL: v_test_umin_ult_i32_multi_use:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s5, s[6:7], 0x0
@@ -2509,6 +2637,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; VI-LABEL: v_test_umin_ult_i32_multi_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s5, s[6:7], 0x0
@@ -2638,6 +2769,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; CI-LABEL: v_test_umin_ult_i16_multi_use:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
@@ -2660,6 +2794,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; VI-LABEL: v_test_umin_ult_i16_multi_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
@@ -2752,6 +2889,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; CI-LABEL: s_test_umin_ult_v1i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2763,6 +2903,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; VI-LABEL: s_test_umin_ult_v1i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2834,6 +2977,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; CI-LABEL: s_test_umin_ult_v8i32:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x8
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2865,6 +3011,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; VI-LABEL: s_test_umin_ult_v8i32:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3125,6 +3274,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; CI-LABEL: s_test_umin_ult_v8i16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3171,6 +3323,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; VI-LABEL: s_test_umin_ult_v8i16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3296,11 +3451,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; CI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -3311,11 +3469,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -3405,11 +3566,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i16 s2, s2
 ; CI-NEXT:    s_sext_i32_i16 s3, s3
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -3420,11 +3584,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -3521,6 +3688,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i16 s3, s2
 ; CI-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3535,6 +3705,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i16 s3, s2
 ; VI-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3617,6 +3790,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3635,6 +3811,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3727,6 +3906,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3745,6 +3927,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3837,6 +4022,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3855,6 +4043,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3947,6 +4138,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3965,6 +4159,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4081,9 +4278,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
@@ -4112,10 +4312,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4233,9 +4436,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
@@ -4263,10 +4469,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index 5792fab7011afe..d7814c52828b88 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -180,6 +180,9 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -260,6 +263,9 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -341,6 +347,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -403,6 +412,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -465,6 +477,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -527,6 +542,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -588,6 +606,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 529e64715500dd..3de6945f95556b 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -176,6 +176,9 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1)
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -254,6 +257,9 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -333,6 +339,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -393,6 +402,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -454,6 +466,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -514,6 +529,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 37d0309caac0ad..993eb11ef5e953 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -10,32 +10,36 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX908: bb.0 (%ir-block.0):
   ; REGALLOC-GFX908-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX908-NEXT: {{  $}}
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %6
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %27
+  ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:av_128 = COPY %27
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %24
+  ; REGALLOC-GFX908-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX908-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
-  ; REGALLOC-GFX908-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
-  ; REGALLOC-GFX908-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; REGALLOC-GFX908-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   S_ENDPGM 0
   ;
   ; PEI-GFX908-LABEL: name: partial_copy
   ; PEI-GFX908: bb.0 (%ir-block.0):
-  ; PEI-GFX908-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+  ; PEI-GFX908-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
   ; PEI-GFX908-NEXT: {{  $}}
-  ; PEI-GFX908-NEXT:   $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
-  ; PEI-GFX908-NEXT:   $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
-  ; PEI-GFX908-NEXT:   $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+  ; PEI-GFX908-NEXT:   $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; PEI-GFX908-NEXT:   $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+  ; PEI-GFX908-NEXT:   $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
   ; PEI-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
   ; PEI-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
   ; PEI-GFX908-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
   ; PEI-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
-  ; PEI-GFX908-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+  ; PEI-GFX908-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
   ; PEI-GFX908-NEXT:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
   ; PEI-GFX908-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
   ; PEI-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
@@ -44,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX908-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX908-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX908-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-  ; PEI-GFX908-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+  ; PEI-GFX908-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
   ; PEI-GFX908-NEXT:   $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
   ; PEI-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX908-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -55,31 +59,34 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX90A: bb.0 (%ir-block.0):
   ; REGALLOC-GFX90A-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX90A-NEXT: {{  $}}
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %6
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %26
+  ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:av_128_align2 = COPY %26
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %24
+  ; REGALLOC-GFX90A-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
-  ; REGALLOC-GFX90A-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; REGALLOC-GFX90A-NEXT:   [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   S_ENDPGM 0
   ;
   ; PEI-GFX90A-LABEL: name: partial_copy
   ; PEI-GFX90A: bb.0 (%ir-block.0):
-  ; PEI-GFX90A-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+  ; PEI-GFX90A-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
   ; PEI-GFX90A-NEXT: {{  $}}
-  ; PEI-GFX90A-NEXT:   $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
-  ; PEI-GFX90A-NEXT:   $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
-  ; PEI-GFX90A-NEXT:   $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+  ; PEI-GFX90A-NEXT:   $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; PEI-GFX90A-NEXT:   $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+  ; PEI-GFX90A-NEXT:   $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
   ; PEI-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
   ; PEI-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
   ; PEI-GFX90A-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
   ; PEI-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
-  ; PEI-GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+  ; PEI-GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
   ; PEI-GFX90A-NEXT:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -87,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX90A-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-  ; PEI-GFX90A-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+  ; PEI-GFX90A-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
   ; PEI-GFX90A-NEXT:   $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index 1fd311f225db4a..51e0eb6d497945 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -153,7 +153,7 @@ define internal void @merge_cycle_1() #3 {
 
 define amdgpu_kernel void @kernel_64_256() #7 {
 ; CHECK-LABEL: define {{[^@]+}}@kernel_64_256
-; CHECK-SAME: () #[[ATTR6]] {
+; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
 ; CHECK-NEXT:    call void @merge_cycle_0()
 ; CHECK-NEXT:    call void @default_captured_address()
 ; CHECK-NEXT:    call void @externally_visible_default()
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index 9577f2a932f966..e911df4aa4da3b 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -210,7 +210,7 @@ define void @externally_visible_default() {
 ; 1,10 -> 3,8
 define internal i32 @bitcasted_function() {
 ; CHECK-LABEL: define internal i32 @bitcasted_function
-; CHECK-SAME: () #[[ATTR8]] {
+; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
 ; CHECK-NEXT:    ret i32 0
 ;
   ret i32 0
@@ -407,7 +407,7 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
 ; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 8792e60bb0ca16..667db7b1ebbd0b 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -148,7 +148,7 @@ define amdgpu_kernel void @kernel_lds() {
 
 define internal i16 @mutual_recursion_0(i16 %arg) {
 ; CHECK-LABEL: define internal i16 @mutual_recursion_0(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4
@@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) {
 
 define internal void @mutual_recursion_1(i16 %arg) {
 ; CHECK-LABEL: define internal void @mutual_recursion_1(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5]] {
 ; CHECK-NEXT:    call void @mutual_recursion_0(i16 [[ARG]])
 ; CHECK-NEXT:    ret void
 ;
@@ -178,7 +178,7 @@ define internal void @mutual_recursion_1(i16 %arg) {
 
 define amdgpu_kernel void @kernel_lds_recursion() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion(
-; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] {
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] !llvm.amdgcn.lds.kernel.id !9 {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ]
 ; CHECK-NEXT:    call void @mutual_recursion_0(i16 0)
 ; CHECK-NEXT:    ret void
@@ -196,6 +196,8 @@ define amdgpu_kernel void @kernel_lds_recursion() {
 ; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index c2132cf907fdb2..847c2d343d4153 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -6,6 +6,9 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -33,9 +36,12 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x5a
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_sad_u32 v2, s2, v0, 20
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -57,6 +63,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -83,7 +92,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -92,6 +103,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -119,12 +131,15 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
-; GCN-NEXT:    s_addc_u32 s17, s17, 0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_sad_u32 v2, s0, v2, v3
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
@@ -151,7 +166,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s3, s0, s1
@@ -162,6 +179,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -186,7 +204,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -197,6 +217,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -222,7 +243,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s1
@@ -233,6 +256,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -255,7 +279,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -264,6 +290,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -285,6 +312,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; GCN-LABEL: v_sad_u32_vector_pat1:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -321,6 +351,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; GCN-LABEL: v_sad_u32_vector_pat2:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -358,6 +391,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 ; GCN-NEXT:    s_load_dword s4, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 16
@@ -365,6 +400,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_sad_u32 v2, s4, v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -384,6 +420,9 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sad_u32_i16_pat2:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_ushort v0, v[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -416,6 +455,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xff
 ; GCN-NEXT:    s_bfe_u32 s4, s2, 0x80008
@@ -443,6 +485,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
 define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sad_u32_i8_pat2:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -475,6 +520,9 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xff
 ; GCN-NEXT:    s_bfe_u32 s4, s2, 0x80008
@@ -502,6 +550,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s6, s0, s1
 ; GCN-NEXT:    s_cmp_le_u32 s0, s1
@@ -531,6 +582,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s3
 ; GCN-NEXT:    s_sub_i32 s6, s1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 89a09dc4fcc171..9826585df8bd80 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -24,6 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX906-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -39,6 +43,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX908-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -55,6 +61,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -88,6 +96,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -103,6 +113,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX906-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -118,6 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX908-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -134,6 +148,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 7f8240eeb98ebf..5d0ddcc7114c26 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -24,175 +24,179 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    v_writelane_b32 v22, s2, 0
 ; CHECK-NEXT:    v_writelane_b32 v22, s3, 1
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:7]
+; CHECK-NEXT:    ; def s[48:51]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s4, 2
-; CHECK-NEXT:    v_writelane_b32 v22, s5, 3
-; CHECK-NEXT:    v_writelane_b32 v22, s6, 4
-; CHECK-NEXT:    v_writelane_b32 v22, s7, 5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:11]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s4, 6
-; CHECK-NEXT:    v_writelane_b32 v22, s5, 7
-; CHECK-NEXT:    v_writelane_b32 v22, s6, 8
-; CHECK-NEXT:    v_writelane_b32 v22, s7, 9
-; CHECK-NEXT:    v_writelane_b32 v22, s8, 10
-; CHECK-NEXT:    v_writelane_b32 v22, s9, 11
-; CHECK-NEXT:    v_writelane_b32 v22, s10, 12
-; CHECK-NEXT:    v_writelane_b32 v22, s11, 13
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 2
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 3
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 4
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 5
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 6
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 7
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 8
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 9
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:19]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s4, 14
-; CHECK-NEXT:    v_writelane_b32 v22, s5, 15
-; CHECK-NEXT:    v_writelane_b32 v22, s6, 16
-; CHECK-NEXT:    v_writelane_b32 v22, s7, 17
-; CHECK-NEXT:    v_writelane_b32 v22, s8, 18
-; CHECK-NEXT:    v_writelane_b32 v22, s9, 19
-; CHECK-NEXT:    v_writelane_b32 v22, s10, 20
-; CHECK-NEXT:    v_writelane_b32 v22, s11, 21
-; CHECK-NEXT:    v_writelane_b32 v22, s12, 22
-; CHECK-NEXT:    v_writelane_b32 v22, s13, 23
-; CHECK-NEXT:    v_writelane_b32 v22, s14, 24
-; CHECK-NEXT:    v_writelane_b32 v22, s15, 25
-; CHECK-NEXT:    v_writelane_b32 v22, s16, 26
-; CHECK-NEXT:    v_writelane_b32 v22, s17, 27
-; CHECK-NEXT:    v_writelane_b32 v22, s18, 28
-; CHECK-NEXT:    v_writelane_b32 v22, s19, 29
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 10
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 11
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 12
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 13
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 14
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 15
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 16
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 17
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 18
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 19
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 20
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 21
+; CHECK-NEXT:    v_writelane_b32 v23, s16, 22
+; CHECK-NEXT:    v_writelane_b32 v23, s17, 23
+; CHECK-NEXT:    v_writelane_b32 v23, s18, 24
+; CHECK-NEXT:    v_writelane_b32 v23, s19, 25
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[42:43]
+; CHECK-NEXT:    ; def s[38:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[52:55]
+; CHECK-NEXT:    ; def s[44:47]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:11]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s4, 30
-; CHECK-NEXT:    v_writelane_b32 v22, s5, 31
-; CHECK-NEXT:    v_writelane_b32 v22, s6, 32
-; CHECK-NEXT:    v_writelane_b32 v22, s7, 33
-; CHECK-NEXT:    v_writelane_b32 v22, s8, 34
-; CHECK-NEXT:    v_writelane_b32 v22, s9, 35
-; CHECK-NEXT:    v_writelane_b32 v22, s10, 36
-; CHECK-NEXT:    v_writelane_b32 v22, s11, 37
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 26
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 27
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 28
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 29
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 30
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 31
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 32
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 33
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[40:41]
+; CHECK-NEXT:    ; def s[36:37]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[36:39]
+; CHECK-NEXT:    ; def s[40:43]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[44:51]
+; CHECK-NEXT:    ; def s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 34
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 35
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 36
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 37
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 38
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 39
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 40
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s0, 38
-; CHECK-NEXT:    v_writelane_b32 v22, s1, 39
-; CHECK-NEXT:    v_writelane_b32 v22, s2, 40
-; CHECK-NEXT:    v_writelane_b32 v22, s3, 41
-; CHECK-NEXT:    v_writelane_b32 v22, s4, 42
-; CHECK-NEXT:    v_writelane_b32 v22, s5, 43
-; CHECK-NEXT:    v_writelane_b32 v22, s6, 44
-; CHECK-NEXT:    v_writelane_b32 v22, s7, 45
-; CHECK-NEXT:    v_writelane_b32 v22, s8, 46
-; CHECK-NEXT:    v_writelane_b32 v22, s9, 47
-; CHECK-NEXT:    v_writelane_b32 v22, s10, 48
-; CHECK-NEXT:    v_writelane_b32 v22, s11, 49
-; CHECK-NEXT:    v_writelane_b32 v22, s12, 50
-; CHECK-NEXT:    v_writelane_b32 v22, s13, 51
-; CHECK-NEXT:    v_writelane_b32 v22, s14, 52
-; CHECK-NEXT:    v_writelane_b32 v22, s15, 53
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 42
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 43
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 44
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 45
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 46
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 47
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 48
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 49
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 50
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 51
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 52
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 53
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 54
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 55
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 56
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 57
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s0, 54
-; CHECK-NEXT:    v_writelane_b32 v22, s1, 55
-; CHECK-NEXT:    v_writelane_b32 v22, s2, 56
-; CHECK-NEXT:    v_writelane_b32 v22, s3, 57
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 58
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 59
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 60
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 61
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s0, 58
-; CHECK-NEXT:    v_writelane_b32 v22, s1, 59
-; CHECK-NEXT:    v_writelane_b32 v22, s2, 60
-; CHECK-NEXT:    ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v22, s3, 61
-; CHECK-NEXT:    v_writelane_b32 v22, s4, 62
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 0
-; CHECK-NEXT:    v_writelane_b32 v22, s5, 63
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 1
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 62
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 4
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 63
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 2
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 3
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 4
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 5
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 6
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 7
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 8
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 9
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 10
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 11
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 12
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 13
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 14
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 15
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 16
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s8, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s9, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s10, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s11, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s12, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s13, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s14, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s15, 21
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 18
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 23
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 20
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 21
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 22
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 24
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 25
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 26
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 27
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 24
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 25
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 26
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 27
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 28
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 29
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 30
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 35
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 32
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 33
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 34
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 35
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 36
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 37
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 38
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 39
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 40
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 41
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 42
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 43
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 44
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 45
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 46
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 47
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 36
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 37
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 38
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 39
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 40
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 41
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 42
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 43
+; CHECK-NEXT:    v_writelane_b32 v0, s8, 44
+; CHECK-NEXT:    v_writelane_b32 v0, s9, 45
+; CHECK-NEXT:    v_writelane_b32 v0, s10, 46
+; CHECK-NEXT:    v_writelane_b32 v0, s11, 47
+; CHECK-NEXT:    v_writelane_b32 v0, s12, 48
+; CHECK-NEXT:    v_writelane_b32 v0, s13, 49
+; CHECK-NEXT:    v_writelane_b32 v0, s14, 50
+; CHECK-NEXT:    v_writelane_b32 v0, s15, 51
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %ret
 ; CHECK-NEXT:    s_endpgm
@@ -202,170 +206,174 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 2
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 3
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 4
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 5
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 2
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 3
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 4
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 5
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 6
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 7
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 8
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 9
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
+; CHECK-NEXT:    ; use s[48:51]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 6
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 7
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 8
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 9
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 10
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v22, 12
-; CHECK-NEXT:    v_readlane_b32 s7, v22, 13
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 14
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 15
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 16
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 17
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 18
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 19
-; CHECK-NEXT:    v_readlane_b32 s6, v22, 20
-; CHECK-NEXT:    v_readlane_b32 s7, v22, 21
-; CHECK-NEXT:    v_readlane_b32 s8, v22, 22
-; CHECK-NEXT:    v_readlane_b32 s9, v22, 23
-; CHECK-NEXT:    v_readlane_b32 s10, v22, 24
-; CHECK-NEXT:    v_readlane_b32 s11, v22, 25
-; CHECK-NEXT:    v_readlane_b32 s12, v22, 26
-; CHECK-NEXT:    v_readlane_b32 s13, v22, 27
-; CHECK-NEXT:    v_readlane_b32 s14, v22, 28
-; CHECK-NEXT:    v_readlane_b32 s15, v22, 29
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 10
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 11
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 12
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 13
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 14
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 15
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 16
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 17
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 18
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 19
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 20
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 21
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 22
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 23
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 24
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 25
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 30
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 31
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 32
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 33
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 34
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 35
-; CHECK-NEXT:    v_readlane_b32 s6, v22, 36
-; CHECK-NEXT:    v_readlane_b32 s7, v22, 37
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 26
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 27
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 28
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 29
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 30
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 31
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 32
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 33
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[42:43]
+; CHECK-NEXT:    ; use s[38:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[52:55]
+; CHECK-NEXT:    ; use s[44:47]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 38
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 39
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 40
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 41
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 34
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 35
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 36
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 37
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 38
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 39
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 40
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[40:41]
+; CHECK-NEXT:    ; use s[36:37]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[36:39]
+; CHECK-NEXT:    ; use s[40:43]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[44:51]
+; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 42
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 43
-; CHECK-NEXT:    v_readlane_b32 s6, v22, 44
-; CHECK-NEXT:    v_readlane_b32 s7, v22, 45
-; CHECK-NEXT:    v_readlane_b32 s8, v22, 46
-; CHECK-NEXT:    v_readlane_b32 s9, v22, 47
-; CHECK-NEXT:    v_readlane_b32 s10, v22, 48
-; CHECK-NEXT:    v_readlane_b32 s11, v22, 49
-; CHECK-NEXT:    v_readlane_b32 s12, v22, 50
-; CHECK-NEXT:    v_readlane_b32 s13, v22, 51
-; CHECK-NEXT:    v_readlane_b32 s14, v22, 52
-; CHECK-NEXT:    v_readlane_b32 s15, v22, 53
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 42
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 43
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 44
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 45
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 46
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 47
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 48
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 49
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 50
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 51
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 52
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 53
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 54
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 55
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 56
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 57
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 54
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 55
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 56
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 57
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 58
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 59
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 60
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 61
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 58
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 59
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 60
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 61
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 62
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 63
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 0
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 1
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 62
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 63
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 0
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 2
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 2
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 3
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 4
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 5
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 6
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 7
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 8
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 9
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 10
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 11
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 12
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 13
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 14
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 15
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 16
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 17
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s8, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s9, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s10, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s11, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s12, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s13, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s14, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s15, v0, 21
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 18
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 19
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 23
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 20
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 21
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 22
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 23
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 24
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 25
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 26
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 27
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 24
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 25
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 26
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 27
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 28
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 29
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 30
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 31
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 35
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 32
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 33
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 34
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 35
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 36
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 37
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 38
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 39
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 40
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 41
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 42
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 43
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 44
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 45
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 46
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 47
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 37
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 39
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 40
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 41
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 42
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 43
+; CHECK-NEXT:    v_readlane_b32 s8, v0, 44
+; CHECK-NEXT:    v_readlane_b32 s9, v0, 45
+; CHECK-NEXT:    v_readlane_b32 s10, v0, 46
+; CHECK-NEXT:    v_readlane_b32 s11, v0, 47
+; CHECK-NEXT:    v_readlane_b32 s12, v0, 48
+; CHECK-NEXT:    v_readlane_b32 s13, v0, 49
+; CHECK-NEXT:    v_readlane_b32 s14, v0, 50
+; CHECK-NEXT:    v_readlane_b32 s15, v0, 51
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index ebc916b5c889b5..f69b7ae1051240 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -182,8 +182,10 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_shl_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -203,6 +205,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -215,8 +218,10 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_lshr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -236,6 +241,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -248,8 +254,10 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_ashr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, 64, s4
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
@@ -270,6 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -430,6 +439,9 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_shl_v2i128ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -502,6 +514,9 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_lshr_v2i128_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -574,6 +589,9 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_ashr_v2i128_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index 3902c6dd422fde..8562cdf195b022 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -100,14 +100,8 @@ entry:
 }
 
 ;.
-; NO: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-;.
-; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
-;.
-; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
 ;.
 ; NO: [[META0]] = !{ptr @bar1, ptr @bar2}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index b8721129222043..a38f0a6d86b8ca 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -20,6 +23,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -38,11 +44,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -52,11 +61,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -72,6 +84,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitcmp1_b32 s2, 0
 ; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -86,6 +101,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitcmp1_b32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -104,6 +122,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; CI-LABEL: s_sint_to_fp_i64_to_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
 ; CI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -117,6 +138,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-LABEL: s_sint_to_fp_i64_to_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -136,6 +160,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -155,6 +182,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -183,6 +213,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i8 s2, s2
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
@@ -195,6 +228,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bfe_i32 s2, s2, 0x80000
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
@@ -232,11 +268,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -246,11 +285,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -283,11 +325,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -297,11 +342,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -353,11 +401,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -367,11 +418,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 882356d994fc68..0a6009c0e7da8a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,10 +12,12 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; GCN-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %12.sub0
+  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %31.sub0
+  ; GCN-NEXT:   SI_SPILL_V64_SAVE %31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %12
+  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; GCN-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]]
   ; GCN-NEXT:   S_ENDPGM 0
   %v0 = call i32 asm sideeffect "; def $0", "=v"()
   %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 3644bef9c20a1f..b2e334c66ccd25 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -50,7 +50,10 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
 define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII:       ; %bb.0:
+; HAWAII-NEXT:    s_add_i32 s10, s10, s15
 ; HAWAII-NEXT:    s_or_b32 s0, s6, 14
+; HAWAII-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; HAWAII-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, s7
 ; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -70,7 +73,10 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI:       ; %bb.0:
+; FIJI-NEXT:    s_add_i32 s10, s10, s15
 ; FIJI-NEXT:    s_or_b32 s0, s6, 14
+; FIJI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; FIJI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; FIJI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIJI-NEXT:    v_mov_b32_e32 v1, s7
 ; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 19d633651fdd0d..30accc846d2b6a 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     9
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 2097579e0c9959..4f84b31f1877b0 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000  ................
+; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     5
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 775c62e73261a9..0b1bd11b88d5d6 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s
@@ -6,15 +7,17 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
-; ASM: .amdhsa_reserve_xnack_mask 1
+; ASM:       ; %bb.0: ; %entry
+; ASM-NEXT:    ;;#ASMSTART
+; ASM-NEXT:    ;;#ASMEND
+; ASM-NEXT:    s_endpgm
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
 ; OBJ: Contents of section .rodata:
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000  @.......!.......
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     9
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 4dfd4c095c87a0..2516177691ce3c 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -23,11 +23,14 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-TRAP-GFX803-LABEL: trap:
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s2
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    s_trap 2
@@ -121,6 +124,9 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-TRAP-GFX803-LABEL: non_entry_trap:
 ; HSA-TRAP-GFX803:       ; %bb.0: ; %entry
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
@@ -282,6 +288,9 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s4
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s5
@@ -414,10 +423,13 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-TRAP-GFX803-LABEL: debugtrap:
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 735956caa72da4..317e350f3eafe5 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -81,6 +81,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-LABEL: udiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -252,6 +255,9 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GCN-LABEL: s_udiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GCN-NEXT:    s_sub_i32 s4, 0, s3
@@ -457,6 +463,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: udiv_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -810,6 +819,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: udiv_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -1135,6 +1147,9 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: udiv_i32_div_pow2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1224,6 +1239,9 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: udiv_i32_div_k_even:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1318,6 +1336,9 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: udiv_i32_div_k_odd:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1430,6 +1451,9 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-LABEL: v_udiv_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1570,6 +1594,9 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1726,6 +1753,9 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 4
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -1923,6 +1953,9 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i24:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 4
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -2105,6 +2138,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
 ; GCN-LABEL: scalarize_mulhu_4xi32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -2218,6 +2254,9 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
 ; GCN-LABEL: test_udiv2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -2281,6 +2320,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
@@ -2371,6 +2413,9 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
 ;
 ; GCN-LABEL: fdiv_test_denormals:
 ; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index d00ea6dff24474..44b16d7f65dc55 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -28,6 +31,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -54,6 +60,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; SI-LABEL: s_uint_to_fp_i64_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -67,6 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-LABEL: s_uint_to_fp_i64_to_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -86,6 +98,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
@@ -103,6 +118,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
@@ -126,6 +144,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) {
 ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x8
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -158,6 +179,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
 ;
 ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -196,6 +220,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
@@ -207,6 +234,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -222,6 +252,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
 ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -239,6 +272,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
@@ -259,6 +295,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
@@ -286,11 +325,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -300,11 +342,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -320,6 +365,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bitcmp1_b32 s2, 0
 ; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -334,6 +382,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitcmp1_b32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -353,6 +404,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s2, s2, 0xff
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -365,6 +419,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -402,11 +459,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -416,11 +476,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -453,11 +516,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -467,11 +533,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,11 +574,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -519,11 +591,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 4545c8bbeb3e6c..33f629a3c4f0cb 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -25,8 +25,9 @@
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr8' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
@@ -40,7 +41,7 @@
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: body:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 8215ba834170f2..1fbd3760eed26c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -25,8 +25,9 @@
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr8' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
@@ -40,7 +41,7 @@
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: body:

>From fefedb95360dcab3040237c0a4aae0229d59cc6b Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Sat, 17 Aug 2024 16:40:41 -0700
Subject: [PATCH 03/13] Previous code was incorrect for indirect calls of known
 callees.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  17 +-
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       |   3 +-
 .../AMDGPU/attributor-flatscratchinit.ll      | 359 ++++++++++--------
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  |   2 +-
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |   6 +-
 5 files changed, 205 insertions(+), 182 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 3ef0694046553b..03391a36c54c79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -697,20 +697,23 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
   // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
   // not to be set.
   bool needFlatScratchInit(Attributor &A) {
+    assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
+
     // This is called on each callee; false means callee shouldn't have
     // no-flat-scratch-init.
     auto CheckForNoFlatScratchInit = [&](Instruction &I) {
       const auto &CB = cast<CallBase>(I);
       const Function *Callee = CB.getCalledFunction();
-      if (!Callee) // indirect call
-        return CB.isInlineAsm();
 
-      if (Callee->isIntrinsic())
-        return Callee->getIntrinsicID() != Intrinsic::amdgcn_addrspacecast_nonnull;
+      if (Callee && Callee->isIntrinsic())
+        return Callee->getIntrinsicID() !=
+               Intrinsic::amdgcn_addrspacecast_nonnull;
 
-      const auto *CalleeInfo = A.getAAFor<AAAMDAttributes>(
-          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
-      return CalleeInfo && CalleeInfo->isAssumed(FLAT_SCRATCH_INIT);
+      // Return true for all other cases, including (1)inline asm, (2)direct
+      // call, and (3)indirect call with known callees. For (2) and (3)
+      // updateImpl() already checked the callees and we know their
+      // FLAT_SCRATCH_INIT bit is set.
+      return true;
     };
 
     bool UsedAssumedInformation = false;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 05ee8cabb5e7c4..0e4a9791b6f57d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -229,7 +229,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
 
 define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
-; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
@@ -261,7 +261,6 @@ attributes #0 = { "amdgpu-no-agpr" }
 ; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
 ; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
 ; CHECK: attributes #[[ATTR10]] = { "amdgpu-no-agpr" }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index 6d9da9281211cb..f04c93961b6705 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -1,6 +1,6 @@
 ; Test the generation of the attribute amdgpu-no-flat-scratch-init
-; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
 
 ;; tests of alloca
 
@@ -160,40 +160,40 @@ define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
 
 define void @call_without_alloca() {
 ; GFX9-LABEL: define void @call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @without_alloca(i1 true)
   ret void
 }
 
 define amdgpu_kernel void @call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @without_alloca(i1 true)
   ret void
 }
 
 define void @call_with_alloca() {
 ; GFX9-LABEL: define void @call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @with_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @with_alloca()
   ret void
 }
@@ -222,50 +222,50 @@ define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
 
 define void @call_call_without_alloca() {
 ; GFX9-LABEL: define void @call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @call_without_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @call_without_alloca()
   ret void
 }
 
 define void @call_call_with_alloca() {
 ; GFX9-LABEL: define void @call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @call_with_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @call_with_alloca()
   ret void
 }
 
 define void @with_alloca_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -286,10 +286,10 @@ define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 
 define void @with_alloca_call_with_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @with_alloca()
@@ -298,10 +298,10 @@ define void @with_alloca_call_with_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @with_alloca()
@@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -334,10 +334,10 @@ define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_with_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_with_alloca()
@@ -346,10 +346,10 @@ define void @with_alloca_call_call_with_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_with_alloca()
@@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 
 define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 
 define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -444,30 +444,30 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 
 define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -486,30 +486,30 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp
 
 define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -517,10 +517,10 @@ define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 
 define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -528,30 +528,30 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add
 
 define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
 
 define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp
 
 define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -632,78 +632,70 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec
 
 define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
-
-
-
-
-
-
-
-
 define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -712,10 +704,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
 
 define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -724,10 +716,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
 
 define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -736,10 +728,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
 
 define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -750,10 +742,10 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_
 
 define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @without_alloca(i1 true)
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -761,10 +753,10 @@ define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr
 
 define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @without_alloca(i1 true)
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -772,10 +764,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern
 
 define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -783,16 +775,16 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
-;; tests of indirect call, intrinsics
+;; tests of indirect call, intrinsics, inline asm
 
 @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 
@@ -838,14 +830,33 @@ define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
   ret void
 }
 
+define void @empty() {
+  ret void
+}
+
+define void @also_empty() {
+  ret void
+}
+
+define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) {
+; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+  %fptr = select i1 %cond, ptr @empty, ptr @also_empty
+  call void %fptr()
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI7:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI7:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) undef
   ret void
@@ -853,10 +864,10 @@ define void @use_intrinsic_workitem_id_x() {
 
 define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) undef
   ret void
@@ -864,84 +875,96 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 
 define void @call_use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI7:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4]]
 ;
 ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI7:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI5:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
+define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+  %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+  store volatile i32 7, ptr %1, align 4
+  ret void
+}
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+  call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+  ret void
+}
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+define amdgpu_kernel void @with_inline_asm() {
+; GFX9-LABEL: with_inline_asm
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3]]
+;
+; GFX10-LABEL: with_inline_asm
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3]]
+  call void asm sideeffect "; use $0", "a"(i32 poison)
+  ret void
+}
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
 ; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 ; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
 ; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 ; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index e911df4aa4da3b..eb4cf5c063d101 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -407,7 +407,7 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
 ; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 667db7b1ebbd0b..58a6437bcbf696 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -148,7 +148,7 @@ define amdgpu_kernel void @kernel_lds() {
 
 define internal i16 @mutual_recursion_0(i16 %arg) {
 ; CHECK-LABEL: define internal i16 @mutual_recursion_0(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5:[0-9]+]] {
+; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4
@@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) {
 
 define internal void @mutual_recursion_1(i16 %arg) {
 ; CHECK-LABEL: define internal void @mutual_recursion_1(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5]] {
+; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @mutual_recursion_0(i16 [[ARG]])
 ; CHECK-NEXT:    ret void
 ;
@@ -196,8 +196,6 @@ define amdgpu_kernel void @kernel_lds_recursion() {
 ; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.

>From 0a0025b81e6177ee0a90ca20f2dccd5ce235dfb1 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Wed, 21 Aug 2024 11:27:13 -0700
Subject: [PATCH 04/13] Undo changes in AMDGPUSubtarget.cpp to reduce impact on
 test files. Those code changes will be in a follow-up PR.

---
 .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll  |  368 +----
 .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll  |  390 +----
 .../AMDGPU/GlobalISel/extractelement.ll       |   69 +-
 ...licit-kernarg-backend-usage-global-isel.ll |   36 +-
 .../GlobalISel/insertelement-stack-lower.ll   |    2 +-
 .../AMDGPU/GlobalISel/lds-global-value.ll     |    5 +-
 .../GlobalISel/llvm.amdgcn.if.break.i64.ll    |    3 -
 .../GlobalISel/llvm.amdgcn.trig.preop.ll      |   24 -
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll |   33 -
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |   33 -
 .../abi-attribute-hints-undefined-behavior.ll |   18 +-
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll     |   10 +-
 llvm/test/CodeGen/AMDGPU/always-uniform.ll    |    3 -
 ...amdgpu-codegenprepare-fold-binop-select.ll |    3 -
 .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll   |    4 +-
 .../attr-amdgpu-flat-work-group-size.ll       |    4 +-
 .../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll    |    6 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        |    4 +-
 .../attributor-flatscratchinit-globalisel.ll  |   54 +-
 llvm/test/CodeGen/AMDGPU/attributor-noopt.ll  |    2 +-
 .../callee-special-input-sgprs-fixed-abi.ll   |   36 +-
 llvm/test/CodeGen/AMDGPU/code-object-v3.ll    |   12 +-
 .../CodeGen/AMDGPU/combine-reg-or-const.ll    |    3 -
 ...dagcomb-extract-vec-elt-different-sizes.ll |    2 -
 ...cannot-create-empty-or-backward-segment.ll |    2 +-
 .../expand-scalar-carry-out-select-user.ll    |    3 -
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |   66 -
 .../fast-unaligned-load-store.global.ll       |   20 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |  236 +--
 .../flat-for-global-subtarget-feature.ll      |    7 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  |   63 +-
 .../AMDGPU/fmul-2-combine-multi-use.ll        |   48 -
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |   60 -
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |    3 -
 llvm/test/CodeGen/AMDGPU/fneg.f16.ll          |   62 +-
 llvm/test/CodeGen/AMDGPU/half.ll              |  231 ---
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |    9 +-
 llvm/test/CodeGen/AMDGPU/hsa.ll               |    4 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   36 +-
 llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll   |   24 +-
 .../AMDGPU/insert_vector_elt.v2bf16.ll        |   58 +-
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  214 +--
 llvm/test/CodeGen/AMDGPU/kernarg-size.ll      |    2 +-
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |    8 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |   70 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |  114 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   |  126 +-
 llvm/test/CodeGen/AMDGPU/load-constant-f64.ll |    6 -
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll |  127 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |   85 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i64.ll |   18 -
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  |  164 +-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   |  129 +-
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   |   83 +-
 llvm/test/CodeGen/AMDGPU/load-select-ptr.ll   |    3 +-
 .../CodeGen/AMDGPU/mad24-get-global-id.ll     |    2 +-
 .../match-perm-extract-vector-elt-bug.ll      |    8 +-
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    |   20 +-
 .../AMDGPU/memory-legalizer-flat-agent.ll     | 1380 -----------------
 .../memory-legalizer-flat-nontemporal.ll      |   75 -
 .../memory-legalizer-flat-singlethread.ll     | 1380 -----------------
 .../AMDGPU/memory-legalizer-flat-system.ll    | 1380 -----------------
 .../AMDGPU/memory-legalizer-flat-volatile.ll  |   66 -
 .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ----------------
 .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ----------------
 .../AMDGPU/memory-legalizer-global-agent.ll   |  273 ----
 .../memory-legalizer-global-nontemporal.ll    |   15 -
 .../memory-legalizer-global-singlethread.ll   |  276 ----
 .../AMDGPU/memory-legalizer-global-system.ll  |  261 ----
 .../memory-legalizer-global-volatile.ll       |   18 -
 .../memory-legalizer-global-wavefront.ll      |  276 ----
 .../memory-legalizer-global-workgroup.ll      |  276 ----
 .../memory-legalizer-local-nontemporal.ll     |    9 -
 .../AMDGPU/memory-legalizer-local-volatile.ll |    6 -
 .../memory-legalizer-private-nontemporal.ll   |   59 +-
 .../memory-legalizer-private-volatile.ll      |   30 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |  210 ---
 llvm/test/CodeGen/AMDGPU/pack.v2f16.ll        |   21 -
 llvm/test/CodeGen/AMDGPU/pack.v2i16.ll        |   18 -
 ...al-regcopy-and-spill-missed-at-regalloc.ll |   56 +-
 llvm/test/CodeGen/AMDGPU/sad.ll               |   68 +-
 .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll  |   16 -
 .../scc-clobbered-sgpr-to-vmem-spill.ll       |  456 +++---
 .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll     |    2 +-
 llvm/test/CodeGen/AMDGPU/shift-i128.ll        |   24 +-
 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll    |   70 +-
 .../CodeGen/AMDGPU/spill-vector-superclass.ll |    6 +-
 llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll |    6 -
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll  |    2 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll  |    2 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll   |    9 +-
 llvm/test/CodeGen/AMDGPU/trap-abis.ll         |   16 +-
 llvm/test/CodeGen/AMDGPU/udiv.ll              |   45 -
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll    |   91 +-
 .../AMDGPU/vgpr-spill-placement-issue61083.ll |    2 +-
 ...ine-function-info-long-branch-reg-debug.ll |    7 +-
 .../machine-function-info-long-branch-reg.ll  |    7 +-
 97 files changed, 644 insertions(+), 12190 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index cb64c25b5f080e..705bcbddf227a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,14 +20,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -38,14 +35,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -105,14 +99,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -123,14 +114,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -303,9 +291,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -321,9 +306,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -383,9 +365,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -403,9 +382,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -468,9 +444,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -488,9 +461,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -553,9 +523,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -568,9 +535,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -621,9 +585,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -638,9 +599,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -694,9 +652,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -711,9 +666,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -768,9 +720,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -778,7 +728,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -793,9 +742,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -803,7 +750,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -868,9 +814,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -888,9 +831,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -950,9 +890,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -968,9 +905,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -986,8 +920,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1002,10 +934,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1042,9 +970,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -1062,9 +987,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -1082,8 +1004,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1098,10 +1018,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1141,9 +1057,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -1161,9 +1074,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -1181,8 +1091,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1197,10 +1105,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1240,9 +1144,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1255,9 +1156,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1270,8 +1168,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1283,10 +1179,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1319,9 +1211,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -1336,9 +1225,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -1353,8 +1239,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1366,10 +1250,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1405,9 +1285,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -1422,9 +1299,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -1439,8 +1313,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1452,10 +1324,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1492,9 +1360,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1502,7 +1368,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -1517,9 +1382,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1527,7 +1390,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -1542,8 +1404,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1562,10 +1422,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 42
@@ -1622,9 +1478,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1642,9 +1495,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1662,8 +1512,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -1677,10 +1525,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1727,13 +1571,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1751,13 +1592,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1775,9 +1613,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1792,10 +1628,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1834,15 +1666,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1860,15 +1689,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1886,9 +1712,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1903,10 +1727,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1948,13 +1768,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1964,13 +1781,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1980,9 +1794,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -1994,10 +1806,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2032,15 +1840,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2050,15 +1855,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2068,9 +1870,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -2082,10 +1882,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2123,15 +1919,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2141,15 +1934,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2159,9 +1949,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -2173,10 +1961,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2215,9 +1999,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2244,9 +2025,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2273,14 +2051,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2294,10 +2070,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2356,9 +2128,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2377,9 +2146,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2398,14 +2164,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2414,10 +2178,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2471,11 +2231,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v2 offset:8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    flat_store_dword v[0:1], v3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2492,11 +2249,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v2 offset:8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2572,10 +2326,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -2591,10 +2342,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2662,10 +2410,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -2681,10 +2426,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2870,13 +2612,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2889,13 +2628,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2955,15 +2691,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2976,15 +2709,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3045,15 +2775,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3066,15 +2793,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3135,13 +2859,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3151,13 +2872,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3208,15 +2926,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3226,15 +2941,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3286,15 +2998,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3304,15 +3013,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3365,9 +3071,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3391,9 +3094,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3470,9 +3170,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3491,9 +3188,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3564,10 +3258,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v4, s3
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
 ; CI-NEXT:    flat_store_dword v[3:4], v0
@@ -3586,10 +3277,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    flat_store_dword v[3:4], v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 00ff2d7a35d561..b3a7e65f771c43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -20,14 +20,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -38,14 +35,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -105,14 +99,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -123,14 +114,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -303,9 +291,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -321,9 +306,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -383,9 +365,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -403,9 +382,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -468,9 +444,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -488,9 +461,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -553,9 +523,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -568,9 +535,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -621,9 +585,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -638,9 +599,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -694,9 +652,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -711,9 +666,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -768,9 +720,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -778,7 +728,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -793,9 +742,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -803,7 +750,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -868,9 +814,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -888,9 +831,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -956,11 +896,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v2 offset:8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    flat_store_dword v[0:1], v3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -977,11 +914,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v2 offset:8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1057,10 +991,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -1076,10 +1007,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1147,10 +1075,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -1166,10 +1091,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1355,13 +1277,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1374,13 +1293,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1440,15 +1356,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1461,15 +1374,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1530,15 +1440,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1551,15 +1458,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1620,13 +1524,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1636,13 +1537,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1693,15 +1591,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1711,15 +1606,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1771,15 +1663,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1789,15 +1678,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1850,9 +1736,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1876,9 +1759,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1955,9 +1835,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1976,9 +1853,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2042,9 +1916,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -2060,9 +1931,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -2078,8 +1946,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2094,10 +1960,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2134,9 +1996,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -2154,9 +2013,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -2174,8 +2030,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2190,10 +2044,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2233,9 +2083,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -2253,9 +2100,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -2273,8 +2117,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2289,10 +2131,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2332,9 +2170,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2347,9 +2182,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2362,8 +2194,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2375,10 +2205,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2411,9 +2237,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -2428,9 +2251,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -2445,8 +2265,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2458,10 +2276,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2497,9 +2311,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -2514,9 +2325,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -2531,8 +2339,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2544,10 +2350,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2584,9 +2386,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2594,7 +2394,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -2609,9 +2408,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2619,7 +2416,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -2634,8 +2430,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2654,10 +2448,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 42
@@ -2714,9 +2504,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2734,9 +2521,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2754,8 +2538,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -2769,10 +2551,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2826,10 +2604,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v4, s3
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
 ; CI-NEXT:    flat_store_dword v[3:4], v0
@@ -2848,10 +2623,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    flat_store_dword v[3:4], v0
@@ -2924,13 +2696,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2948,13 +2717,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2972,9 +2738,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2989,10 +2753,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3031,15 +2791,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3057,15 +2814,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3083,9 +2837,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -3100,10 +2852,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3145,15 +2893,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3171,15 +2916,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3197,9 +2939,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -3214,10 +2954,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3259,13 +2995,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3275,13 +3008,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3291,9 +3021,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3305,10 +3033,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3343,15 +3067,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3361,15 +3082,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3379,9 +3097,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3393,10 +3109,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3434,15 +3146,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3452,15 +3161,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3470,9 +3176,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3484,10 +3188,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3526,9 +3226,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3555,9 +3252,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3584,14 +3278,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3605,10 +3297,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3667,9 +3355,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3688,9 +3373,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3709,14 +3391,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3725,10 +3405,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3779,7 +3455,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
@@ -3787,8 +3462,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; CI-NEXT:    ds_inc_rtn_u32 v3, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -3802,7 +3475,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
@@ -3810,8 +3482,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; VI-NEXT:    ds_inc_rtn_u32 v3, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 868b530e42a219..34efb089b72bf1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3037,7 +3037,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 12
+; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3052,7 +3052,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -3069,7 +3069,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 15
+; GPRIDX-NEXT:     wavefront_sgpr_count = 13
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3117,7 +3117,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -3128,7 +3128,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 12
+; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3143,7 +3143,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -3160,7 +3160,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 22
+; MOVREL-NEXT:     wavefront_sgpr_count = 9
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -3178,24 +3178,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; MOVREL-NEXT:    s_load_dword s8, s[6:7], 0x8
-; MOVREL-NEXT:    s_add_i32 s10, s10, s15
-; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; MOVREL-NEXT:    s_mov_b32 s4, 0
 ; MOVREL-NEXT:    s_mov_b32 s5, 0x40080000
+; MOVREL-NEXT:    s_mov_b32 s2, 0
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
 ; MOVREL-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
-; MOVREL-NEXT:    s_mov_b32 s2, 0
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 3
-; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 4.0, s[4:5]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 4
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
-; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3223,7 +3220,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 12
+; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3238,7 +3235,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4057,7 +4054,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 12
+; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4072,7 +4069,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4089,7 +4086,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 14
+; GPRIDX-NEXT:     wavefront_sgpr_count = 12
 ; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4130,7 +4127,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 0
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4141,7 +4138,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 12
+; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4156,7 +4153,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4173,7 +4170,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 22
+; MOVREL-NEXT:     wavefront_sgpr_count = 8
 ; MOVREL-NEXT:     workitem_vgpr_count = 3
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -4191,9 +4188,6 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; MOVREL-NEXT:    s_add_i32 s10, s10, s15
-; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s2, 1
 ; MOVREL-NEXT:    s_cselect_b32 s3, 2.0, 1.0
@@ -4229,7 +4223,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 12
+; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4244,7 +4238,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4407,7 +4401,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 12
+; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4422,7 +4416,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4439,7 +4433,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 15
+; GPRIDX-NEXT:     wavefront_sgpr_count = 13
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4483,7 +4477,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4494,7 +4488,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 12
+; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4509,7 +4503,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4526,7 +4520,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 22
+; MOVREL-NEXT:     wavefront_sgpr_count = 9
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -4544,12 +4538,10 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dword s8, s[6:7], 0x8
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; MOVREL-NEXT:    s_add_i32 s10, s10, s15
-; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; MOVREL-NEXT:    s_mov_b32 s2, 0
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
-; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
@@ -4557,7 +4549,6 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
-; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -4585,7 +4576,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 12
+; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4600,7 +4591,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 70b889389ff998..9443b39dcdc033 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -12,9 +12,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_mov_b32 s4, s0
 ; GFX8V4-NEXT:    s_mov_b32 s5, s3
@@ -25,7 +23,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], s[6:7], 0
-; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
@@ -40,9 +37,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_mov_b32 s4, s0
 ; GFX8V5-NEXT:    s_mov_b32 s5, s2
@@ -52,7 +47,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], s[2:3], 0
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
@@ -66,10 +60,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_mov_b32 s2, s0
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
@@ -78,7 +71,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V4-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
-; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
@@ -92,10 +84,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_mov_b32 s2, s0
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
@@ -104,7 +95,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V5-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
-; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
@@ -127,9 +117,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
@@ -143,9 +130,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xcc
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
@@ -189,9 +173,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
@@ -205,9 +186,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
@@ -291,10 +269,7 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
 ; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -320,9 +295,6 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 7aa3b5bb109903..4fcde0f2fc7cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[20:23], s[6:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x10
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s13
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index 6ed2df430998f8..c3938e673a6da6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,16 +11,13 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-NEXT:    ds_read_b32 v2, v0
-; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-NEXT:    v_mov_b32_e32 v3, 9
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    v_mov_b32_e32 v3, 9
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x200
 ; CHECK-NEXT:    ds_write_b32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index 3a90c3ee908033..d7a82b415ff06c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -6,9 +6,6 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xa
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 0
 ; GCN-NEXT:    s_cselect_b32 s2, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index 96fd14f52d13b0..f3654fea486e0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -42,9 +42,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -62,9 +59,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -82,8 +76,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -93,10 +85,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ;
 ; GFX10-LABEL: s_trig_preop_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x8
@@ -125,9 +113,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; CI-LABEL: s_trig_preop_f64_imm:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; CI-NEXT:    s_add_u32 s0, s0, 4
@@ -143,9 +128,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; VI-LABEL: s_trig_preop_f64_imm:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; VI-NEXT:    s_add_u32 s0, s0, 4
@@ -161,8 +143,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; GFX9-LABEL: s_trig_preop_f64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; GFX9-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
@@ -171,10 +151,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ;
 ; GFX10-LABEL: s_trig_preop_f64_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index f0ec0d101f5bec..7d7f450e590faa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -7,9 +7,6 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s8, s5, 31
 ; GFX8-NEXT:    s_add_i32 s0, s5, s8
@@ -148,9 +145,6 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
 ; GFX8-LABEL: sdivrem_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
@@ -622,9 +616,6 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s10, 31
@@ -854,9 +845,6 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1283,9 +1271,6 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: sdivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2203,9 +2188,6 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
 ; GFX8-LABEL: sdiv_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x80008
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -2351,9 +2333,6 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_v2i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x80010
 ; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
@@ -2618,9 +2597,6 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
 ; GFX8-LABEL: sdiv_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x100010
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -2766,9 +2742,6 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-LABEL: sdivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s3
 ; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
@@ -3030,9 +3003,6 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; GFX8-LABEL: sdivrem_i3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x30008
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -3184,9 +3154,6 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s5, 0x1b0000
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 2be04ace99e36d..5aef6679347094 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -7,9 +7,6 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: udivrem_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s5
@@ -115,9 +112,6 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
 ; GFX8-LABEL: udivrem_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s11
@@ -528,9 +522,6 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: udivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s10
@@ -694,9 +685,6 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: udivrem_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -991,10 +979,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: udivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
 ; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x20
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
@@ -1787,9 +1772,6 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
 ; GFX8-LABEL: udiv_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x80008
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
@@ -1903,9 +1885,6 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s2, s0, 0x80010
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
@@ -2102,9 +2081,6 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
 ; GFX8-LABEL: udiv_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
@@ -2217,10 +2193,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-LABEL: udivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s2, s1, 0xffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
@@ -2414,9 +2387,6 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; GFX8-LABEL: udivrem_i3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x30008
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
@@ -2535,9 +2505,6 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: udivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0x7ffffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index b8ffa4f14c3e5c..e53653408feb40 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -135,9 +135,6 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
 ; FIXEDABI-LABEL: marked_kernel_use_workitem_id:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v4, s1
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s0
@@ -184,19 +181,16 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
 ; FIXEDABI-LABEL: marked_kernel_use_workgroup_id:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s6
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, s1
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s9
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s7
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s10
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-NEXT:    s_endpgm
@@ -244,9 +238,6 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; FIXEDABI-NEXT:    s_add_u32 s0, s4, 8
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_addc_u32 s1, s5, 0
@@ -270,10 +261,7 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
 define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
 ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_add_i32 s4, s4, s9
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v0, 0
-; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, 0
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 9534561e6e2805..62f6890e92662e 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -69,7 +69,7 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
 ; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
@@ -254,8 +254,8 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
@@ -279,8 +279,8 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
 
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index e0c69706bad79c..0a461f9ee6c968 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -8,10 +8,8 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-LABEL: readfirstlane_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NEXT:    s_mov_b32 s5, 0
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s0, s0, s4
@@ -20,7 +18,6 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-NEXT:    s_add_u32 s0, s2, 40
 ; GCN-NEXT:    s_addc_u32 s1, s3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 89c5303e0e81d1..bf72cccd912cee 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -393,14 +393,11 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x83
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x80
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s0, 0
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 54a800ecee9f17..a6d8c6f41eee59 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
 
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE:  NumSgprs: 83
-; TRAP-HANDLER-DISABLE: NumSgprs: 98
+; TRAP-HANDLER-ENABLE:  NumSgprs: 77
+; TRAP-HANDLER-DISABLE: NumSgprs: 92
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
     ptr addrspace(1) %out0, i32 %in0,
     ptr addrspace(1) %out1, i32 %in1,
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 22cc5af30da662..fc13b86566f76c 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -35,9 +35,9 @@ entry:
 attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
 
 ; CHECK-LABEL: {{^}}min_1024_max_1024
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 24{{$}}
+; CHECK: NumSGPRsForWavesPerEU: 2{{$}}
 ; CHECK: NumVGPRsForWavesPerEU: 43
 @var = addrspace(1) global float 0.0
 define amdgpu_kernel void @min_1024_max_1024() #3 {
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index 3ddf8be052e4ab..ed045107d354dc 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -4,8 +4,8 @@
 
 ; ALL-LABEL: {{^}}max_10_sgprs:
 
-; ALL: SGPRBlocks: 2
-; ALL: NumSGPRsForWavesPerEU: 22
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 10
 define amdgpu_kernel void @max_10_sgprs() #0 {
   %one = load volatile i32, ptr addrspace(4) undef
   %two = load volatile i32, ptr addrspace(4) undef
@@ -125,7 +125,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1
 declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
 declare ptr addrspace(4) @llvm.amdgcn.queue.ptr() #1
 
-attributes #0 = { nounwind "amdgpu-num-sgpr"="18" }
+attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
 attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 26dc3e13a72cf0..14519f5a5e77c0 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
 
 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 3
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 26
+; CHECK: NumSGPRsForWavesPerEU: 20
 ; CHECK: NumVGPRsForWavesPerEU: 24
 define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, ptr addrspace(1) @var
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index f9f48aacfc2bf7..b7503f26b1ab6d 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -503,38 +503,31 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ;
 ; GFX10: name:            with_alloca_cc_vs
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_gs
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
 ; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
 ;
 ; GFX10: name:            with_alloca_cc_ps
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_cs
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_hs
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
 ; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
 ;
 ; GFX10: name:            with_alloca_cc_ls
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_es
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_gfx
 ; GFX10:       argumentInfo:
@@ -701,8 +694,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
 ;
 ; GFX10: name:            without_region_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -730,8 +723,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            without_group_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -759,8 +751,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            without_constant_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -788,8 +779,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -817,8 +807,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -846,8 +835,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_both_with_and_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -861,8 +849,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_call_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -890,8 +877,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_call_both_with_and_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -905,8 +891,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            with_cast_call_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -920,8 +905,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            with_cast_call_with_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -935,8 +919,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_without_alloca_and_without_addrspacecast
 ; GFX10:       argumentInfo:
@@ -964,8 +947,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            with_indirect_call
 ; GFX10:       argumentInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 470c444ad8cd74..90562e25a3e9c1 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -23,7 +23,7 @@
 ; COV5: .amdhsa_user_sgpr_queue_ptr 0
 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
-; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1
+; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
 ; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
 ; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 25a5207d3bbb7c..8ef2d89e76d4e1 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -198,11 +198,11 @@ define hidden void @use_workgroup_id_yz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
 ; GCN-NOT: s6
-; GCN: s_mov_b32 s12, s4
+; GCN: s_mov_b32 s12, s6
 ; GCN: s_mov_b32 s32, 0
-; GCN: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x at rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x at rel32@hi+12
+; GCN: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x at rel32@hi+12
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 
@@ -216,7 +216,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
 ; GCN-NOT: s12
-; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s13, s7
 ; GCN-NOT: s12
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -232,7 +232,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
 ; GCN-NOT: s12
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s14, s5
+; GCN: s_mov_b32 s14, s7
 ; GCN-NOT: s12
 ; GCN-NOT: s13
 
@@ -249,8 +249,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
 ; GCN-NOT: s14
-; GCN: s_mov_b32 s12, s4
-; GCN-NEXT: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s12, s6
+; GCN-NEXT: s_mov_b32 s13, s7
 ; GCN-NOT: s14
 
 ; GCN: s_mov_b32 s32, 0
@@ -265,9 +265,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
-; GCN: s_mov_b32 s12, s4
-; GCN: s_mov_b32 s13, s5
-; GCN: s_mov_b32 s14, s6
+; GCN: s_mov_b32 s12, s6
+; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s14, s8
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 
@@ -282,8 +282,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
 
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s12, s4
-; GCN-NEXT: s_mov_b32 s14, s5
+; GCN: s_mov_b32 s12, s6
+; GCN-NEXT: s_mov_b32 s14, s7
 ; GCN-NOT: s13
 
 ; GCN: s_mov_b32 s32, 0
@@ -299,8 +299,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
 
-; GCN: s_mov_b32 s13, s5
-; GCN: s_mov_b32 s14, s6
+; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s14, s8
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -381,7 +381,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
 
 ; GCN-NOT: s13
 ; GCN-NOT: s14
-; GCN-DAG: s_mov_b32 s12, s4
+; GCN-DAG: s_mov_b32 s12, s6
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
 ; GCN-NOT: s13
 ; GCN-NOT: s14
@@ -399,7 +399,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s13, s5
+; GCN-DAG: s_mov_b32 s13, s7
 
 ; GCN-DAG: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -414,7 +414,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s14, s5
+; GCN-DAG: s_mov_b32 s14, s7
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 48ad2fe687804c..3035a8579c8a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -12,13 +12,13 @@
 ; OSABI-AMDHSA-ASM: .section .rodata,"a"
 ; OSABI-AMDHSA-ASM: .p2align 6
 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd
-; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 12
+; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 1
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
@@ -31,13 +31,13 @@
 ; OSABI-AMDHSA-ASM: .section .rodata,"a"
 ; OSABI-AMDHSA-ASM: .p2align 6
 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub
-; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 12
+; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 1
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index c97d3338006022..5fbcd0bf669995 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,9 +5,6 @@
 define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
 ; CHECK-LABEL: _Z11test_kernelPii:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 3
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index f586f6d16e0efd..297fe7618672e6 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -6,8 +6,6 @@
 define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) {
 ; CHECK-LABEL: eggs:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x8
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 39328d706ba260..85ed2914b8c7f5 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dword s14, s[6:7], 0x4
-; CHECK-NEXT:    s_add_u32 s24, s24, s15
+; CHECK-NEXT:    s_add_u32 s24, s24, s13
 ; CHECK-NEXT:    s_addc_u32 s25, s25, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_bitcmp1_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 05a245cd3443c4..54fb1dc5c05274 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -94,9 +94,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
 ; GFX7-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_add_i32 s0, s2, s2
 ; GFX7-NEXT:    s_cmp_lt_u32 s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 7fde702bd49ac8..21799ab79b8396 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -13,9 +13,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -28,9 +25,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -74,9 +68,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -89,9 +80,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -134,9 +122,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -149,9 +134,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -193,9 +175,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; CI-LABEL: s_fabs_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
@@ -209,9 +188,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; VI-LABEL: s_fabs_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
@@ -256,9 +232,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; CI-LABEL: fabs_fold_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -276,9 +249,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -325,9 +295,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -342,9 +309,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -394,9 +358,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -409,9 +370,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -457,9 +415,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -486,9 +441,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -547,12 +499,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v0, v[0:1]
 ; CI-NEXT:    s_lshr_b32 s2, s4, 16
@@ -578,12 +527,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -643,9 +589,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -669,9 +612,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -742,9 +682,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -763,9 +700,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 4df6b8d0669158..7252c69cb1cf75 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -74,9 +74,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -93,9 +90,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -226,10 +220,8 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -239,7 +231,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v2
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s0, s0, 3
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v3
@@ -256,9 +247,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -371,9 +359,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -384,9 +369,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 6482749bd2fb7e..f0ce96af90649d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -24,9 +24,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
 ; GFX678-LABEL: v_test_canonicalize_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -83,9 +80,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -97,9 +91,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -149,9 +140,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -208,9 +196,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -268,9 +253,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -327,9 +309,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: test_fold_canonicalize_undef_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -373,9 +352,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -419,9 +395,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -468,9 +441,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -515,9 +485,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, -1.0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -562,9 +529,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
 ; GFX678-LABEL: test_fold_canonicalize_literal_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -609,9 +573,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -655,13 +616,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -706,13 +664,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -757,13 +712,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -808,9 +760,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -855,9 +804,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -904,9 +850,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x807fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -951,9 +894,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
 ; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -998,9 +938,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1045,9 +982,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1092,9 +1026,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1139,9 +1070,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1186,9 +1114,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1233,9 +1158,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1280,9 +1202,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
 ; GFX678-LABEL: v_test_canonicalize_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1338,9 +1257,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX6-LABEL: s_test_canonicalize_var_f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_max_f64 v[2:3], s[2:3], s[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -1351,9 +1267,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX8-LABEL: s_test_canonicalize_var_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -1400,9 +1313,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1459,9 +1369,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1519,9 +1426,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1578,13 +1482,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1630,13 +1531,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1680,13 +1578,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1728,13 +1623,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1776,13 +1668,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX678-LABEL: test_fold_canonicalize_literal_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1824,13 +1713,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1876,13 +1762,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, -1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1927,13 +1810,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1977,13 +1857,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, -1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2028,13 +1905,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2076,13 +1950,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2124,13 +1995,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2172,13 +2040,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2220,13 +2085,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2268,13 +2130,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2316,13 +2175,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2365,9 +2221,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2385,9 +2238,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2455,9 +2305,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2475,9 +2322,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2545,9 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2566,9 +2407,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2637,9 +2475,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2663,9 +2498,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2736,9 +2568,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2756,9 +2585,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2826,9 +2652,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2846,9 +2669,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2917,9 +2737,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2938,9 +2755,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -3010,9 +2824,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -3036,9 +2847,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -3108,9 +2916,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -3128,9 +2933,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index ab00b132f2fd1a..fee6540f43c64c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -6,13 +6,18 @@
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga | FileCheck -check-prefix=NOHSA-NOADDR64 -check-prefix=ALL %s
 
 
+; There are no stack objects even though flat is used by default, so
+; flat_scratch_init should be disabled.
+
 ; ALL-LABEL: {{^}}test:
 
+; ALL-NOT: flat_scr
+
 ; HSA-DEFAULT: flat_store_dword
 ; HSA-NODEFAULT: buffer_store_dword
 ; HSA-NOADDR64: flat_store_dword
 
-; HSA: .amdhsa_user_sgpr_flat_scratch_init 1
+; HSA: .amdhsa_user_sgpr_flat_scratch_init 0
 
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 134b7902380862..e4ffedd686ac93 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=FIJI-NOXNACK -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
@@ -8,16 +8,16 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 
@@ -28,12 +28,8 @@
 ; CI: ; NumSgprs: 8
 ; VI-NOXNACK: ; NumSgprs: 8
 ; VI-XNACK: ; NumSgprs: 12
-; HSA-VI-NOXNACK: ; NumSgprs: 8
-; HSA-VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8
+; GFX9-ARCH-FLAT: ; NumSgprs: 14
+; GFX10-ARCH-FLAT: ; NumSgprs: 8
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -49,12 +45,8 @@ entry:
 ; CI: ; NumSgprs: 10
 ; VI-NOXNACK: ; NumSgprs: 10
 ; VI-XNACK: ; NumSgprs: 12
-; HSA-VI-NOXNACK: ; NumSgprs: 10
-; HSA-VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10
+; GFX9-ARCH-FLAT: ; NumSgprs: 14
+; GFX10-ARCH-FLAT: ; NumSgprs: 10
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -68,15 +60,10 @@ entry:
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
 ; CI: ; NumSgprs: 12
-; FIJI-NOXNACK: ; NumSgprs: 14
 ; VI-NOXNACK: ; NumSgprs: 14
 ; VI-XNACK: ; NumSgprs: 14
-; HSA-VI-NOXNACK: ; NumSgprs: 24
-; HSA-VI-XNACK: ; NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8
+; GFX9-ARCH-FLAT: ; NumSgprs: 14
+; GFX10-ARCH-FLAT: ; NumSgprs: 8
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -92,10 +79,8 @@ entry:
 ; CI: ; NumSgprs: 12
 ; VI-NOXNACK: ; NumSgprs: 14
 ; VI-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10
+; GFX9-ARCH-FLAT: ; NumSgprs: 14
+; GFX10-ARCH-FLAT: ; NumSgprs: 10
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -114,10 +99,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; NumSgprs: 6
+; GFX10-ARCH-FLAT: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -133,10 +116,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; NumSgprs: 6
+; GFX10-ARCH-FLAT: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -152,10 +133,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; NumSgprs: 6
+; GFX10-ARCH-FLAT: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 23b453438f3a0f..c60b9858abd836 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -16,9 +16,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ; VI-LABEL: multiple_fadd_use_test_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_add_f32_e64 v0, s3, -1.0
 ; VI-NEXT:    v_add_f32_e64 v1, s2, -1.0
@@ -85,11 +82,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x2c
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_add_u32 s2, s0, 4
 ; VI-NEXT:    v_add_f32_e64 v2, s4, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -149,9 +143,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
 ; VI-LABEL: multiple_use_fadd_fmad_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_add_u32 s4, s0, 4
@@ -209,9 +200,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s6, s4, 4
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
@@ -275,9 +263,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e64 v0, s2, -4.0
 ; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
@@ -328,13 +313,10 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc0c00000
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -380,9 +362,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
-; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-DENORM-NEXT:    v_add_f16_e64 v0, s2, -1.0
@@ -401,9 +380,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
-; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-FLUSH-NEXT:    v_add_f16_e64 v0, s2, -1.0
@@ -522,9 +498,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
-; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s3
@@ -546,9 +519,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
-; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
@@ -649,9 +619,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
-; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s3
@@ -673,9 +640,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
-; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s3
@@ -778,8 +742,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; VI-DENORM-NEXT:    s_load_dword s6, s[6:7], 0x8
-; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
-; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s0, s0, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
@@ -787,7 +749,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-DENORM-NEXT:    v_fma_f16 v3, |s6|, 2.0, v0
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_add_u32 s4, s2, 2
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-DENORM-NEXT:    s_addc_u32 s5, s3, 0
@@ -804,8 +765,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; VI-FLUSH-NEXT:    s_load_dword s6, s[6:7], 0x8
-; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
-; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s0, s0, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
@@ -813,7 +772,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-FLUSH-NEXT:    v_mad_f16 v3, |s6|, 2.0, v0
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_add_u32 s4, s2, 2
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-FLUSH-NEXT:    s_addc_u32 s5, s3, 0
@@ -917,9 +875,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, s2, -4.0
 ; VI-NEXT:    v_mul_f16_e32 v2, s2, v0
@@ -973,13 +928,10 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc600
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
 ; VI-NEXT:    v_mul_f16_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index c4f13749251a4c..8267bb9f5450f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -8,9 +8,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; CI-LABEL: fneg_fabs_fadd_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -28,9 +25,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -77,9 +71,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; CI-LABEL: fneg_fabs_fmul_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s1, s0, 0x7fff
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -98,9 +89,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -151,9 +139,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitset1_b32 s2, 15
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -166,9 +151,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitset1_b32 s2, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -213,9 +195,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitset1_b32 s2, 15
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -228,9 +207,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitset1_b32 s2, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -273,9 +249,6 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CIVI-LABEL: v_fneg_fabs_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -321,9 +294,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s1
@@ -346,9 +316,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -357,7 +325,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -402,9 +369,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_or_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -417,9 +381,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_or_b32 s2, s2, 0x80008000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -462,9 +423,6 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ; CIVI-LABEL: fneg_fabs_v4f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_or_b32 s3, s3, 0x80008000
 ; CIVI-NEXT:    s_or_b32 s2, s2, 0x80008000
@@ -510,9 +468,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; CI-LABEL: fold_user_fneg_fabs_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s1|
@@ -534,9 +489,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc400
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -544,7 +497,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; VI-NEXT:    v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -586,9 +538,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
@@ -606,9 +555,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
@@ -665,9 +611,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_bfe_u32 s0, s4, 0xf0010
@@ -692,9 +635,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xc400
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_lshr_b32 s1, s4, 16
@@ -703,7 +644,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; VI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
 ; VI-NEXT:    v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e64 v5, |s4|, -4.0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; VI-NEXT:    v_mov_b32_e32 v5, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 7137834b6552e0..98b17bbaa0a959 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1478,8 +1478,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x6
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bitcmp1_b32 s8, 0
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1491,7 +1489,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 6e22c9c319f698..40982347f3ca00 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -10,9 +10,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -25,9 +22,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -72,9 +66,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -89,9 +80,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -141,9 +129,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -156,9 +141,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -201,9 +183,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CI-LABEL: v_fneg_fold_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -221,9 +200,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8-LABEL: v_fneg_fold_f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -270,9 +246,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -285,9 +258,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -329,17 +299,14 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
 ; CIVI-LABEL: s_fneg_v2f16_nonload:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
 ; CIVI-NEXT:    ;;#ASMSTART
 ; CIVI-NEXT:    ; def s2
 ; CIVI-NEXT:    ;;#ASMEND
 ; CIVI-NEXT:    s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
-; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
@@ -382,9 +349,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -399,9 +363,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -451,9 +412,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -466,9 +424,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -511,9 +466,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; CI-LABEL: v_fneg_fold_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -540,9 +492,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX8-LABEL: v_fneg_fold_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -590,9 +539,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; CI-LABEL: v_extract_fneg_fold_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -614,9 +560,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; GFX8-LABEL: v_extract_fneg_fold_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -681,9 +624,6 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 6207c442f41eeb..3735c6349fbb31 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -10,9 +10,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -24,9 +21,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -54,9 +48,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -68,9 +59,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -97,9 +85,6 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ; CIVI-LABEL: load_v3f16_arg:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_add_u32 s4, s0, 4
 ; CIVI-NEXT:    s_addc_u32 s5, s1, 0
@@ -135,9 +120,6 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; CIVI-LABEL: load_v4f16_arg:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v2, s2
@@ -165,9 +147,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v4, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -182,9 +161,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -217,9 +193,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s3, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -233,9 +206,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -269,9 +239,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -283,9 +250,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -315,9 +279,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s3, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -331,9 +292,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -366,9 +324,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; CI-LABEL: extload_v3f16_to_v3f32_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
@@ -382,9 +337,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; VI-LABEL: extload_v3f16_to_v3f32_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
@@ -417,9 +369,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; CI-LABEL: extload_v4f16_to_v4f32_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s3, 16
 ; CI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -435,9 +384,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; VI-LABEL: extload_v4f16_to_v4f32_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -475,9 +421,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s6, s1, 16
 ; CI-NEXT:    s_lshr_b32 s7, s0, 16
@@ -506,9 +449,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s6, s1, 16
 ; VI-NEXT:    s_lshr_b32 s7, s0, 16
@@ -567,9 +507,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; CI-LABEL: extload_f16_to_f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -583,9 +520,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; VI-LABEL: extload_f16_to_f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -619,9 +553,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; CI-LABEL: extload_v2f16_to_v2f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s1
@@ -638,9 +569,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; VI-LABEL: extload_v2f16_to_v2f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s1
@@ -680,9 +608,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; CI-LABEL: extload_v3f16_to_v3f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
@@ -704,9 +629,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; VI-LABEL: extload_v3f16_to_v3f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
@@ -754,9 +676,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; CI-LABEL: extload_v4f16_to_v4f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
@@ -782,9 +701,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; VI-LABEL: extload_v4f16_to_v4f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s3, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s3
@@ -840,9 +756,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s6, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s6
@@ -890,9 +803,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s6, s0, 16
 ; VI-NEXT:    s_lshr_b32 s8, s2, 16
@@ -980,9 +890,6 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
 ; CIVI-LABEL: global_load_store_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1013,9 +920,6 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: global_load_store_v2f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1046,9 +950,6 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
 ; CIVI-LABEL: global_load_store_v4f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1079,9 +980,6 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: global_load_store_v8f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1112,9 +1010,6 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
 ; CIVI-LABEL: global_extload_f16_to_f32:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1148,9 +1043,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v2f16_to_v2f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1167,9 +1059,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v2f16_to_v2f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1207,9 +1096,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v3f16_to_v3f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1227,9 +1113,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v3f16_to_v3f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1269,9 +1152,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v4f16_to_v4f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1291,9 +1171,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v4f16_to_v4f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1336,9 +1213,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v8f16_to_v8f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1369,9 +1243,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v8f16_to_v8f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1430,9 +1301,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; CI-LABEL: global_extload_v16f16_to_v16f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 16
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
@@ -1491,9 +1359,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; VI-LABEL: global_extload_v16f16_to_v16f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1593,9 +1458,6 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
 ; CIVI-LABEL: global_extload_f16_to_f64:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1632,9 +1494,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v2f16_to_v2f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1653,9 +1512,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v2f16_to_v2f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1698,9 +1554,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v3f16_to_v3f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1726,9 +1579,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v3f16_to_v3f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1782,9 +1632,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v4f16_to_v4f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1813,9 +1660,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v4f16_to_v4f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1875,9 +1719,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v8f16_to_v8f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1926,9 +1767,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v8f16_to_v8f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2015,9 +1853,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; CI-LABEL: global_extload_v16f16_to_v16f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2113,9 +1948,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; VI-LABEL: global_extload_v16f16_to_v16f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2271,9 +2103,6 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
 ; CIVI-LABEL: global_truncstore_f32_to_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2307,9 +2136,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v2f32_to_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2327,9 +2153,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v2f32_to_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2368,9 +2191,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v3f32_to_v3f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2394,9 +2214,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v3f32_to_v3f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2444,9 +2261,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v4f32_to_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2468,9 +2282,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v4f32_to_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2515,9 +2326,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v8f32_to_v8f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2553,9 +2361,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v8f32_to_v8f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2621,9 +2426,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; CI-LABEL: global_truncstore_v16f32_to_v16f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 32
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
@@ -2693,9 +2495,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; VI-LABEL: global_truncstore_v16f32_to_v16f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s4, s2, 32
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
@@ -2808,9 +2607,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; CI-LABEL: fadd_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -2828,9 +2624,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -2863,9 +2656,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; CI-LABEL: fadd_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
@@ -2887,9 +2677,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; VI-LABEL: fadd_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -2923,9 +2710,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-LABEL: fadd_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2963,9 +2747,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-LABEL: fadd_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3006,9 +2787,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 {
 ; CI-LABEL: fadd_v8f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3067,9 +2845,6 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
 ;
 ; VI-LABEL: fadd_v8f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3134,9 +2909,6 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
 ; CIVI-LABEL: test_bitcast_from_half:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3168,9 +2940,6 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
 ; CIVI-LABEL: test_bitcast_to_half:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 51b0e2b86cdf39..8c017fa5ec2636 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,10 +15,7 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; GFX700:     .sgpr_count:     22
-; GFX803:     .sgpr_count:     24
-; GFX900:     .sgpr_count:     10
-; GFX1010:    .sgpr_count:     10
+; CHECK:     .sgpr_count:     10
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
@@ -51,8 +48,8 @@ entry:
 
 ; CHECK:   .name:       num_spilled_sgprs
 ; GFX700:   .sgpr_spill_count: 10
-; GFX803:   .sgpr_spill_count: 0
-; GFX900:   .sgpr_spill_count: 0
+; GFX803:   .sgpr_spill_count: 10
+; GFX900:   .sgpr_spill_count: 62
 ; GFX1010:  .sgpr_spill_count: 60
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
 define amdgpu_kernel void @num_spilled_sgprs(
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 2c38e201d326f6..37476203fbfad6 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -43,7 +43,7 @@
 ; ELF:   00E0: 6E616D65 A673696D 706C65BB 2E707269
 ; ELF:   00F0: 76617465 5F736567 6D656E74 5F666978
 ; ELF:   0100: 65645F73 697A6500 AB2E7367 70725F63
-; ELF:   0110: 6F756E74 0EB12E73 6770725F 7370696C
+; ELF:   0110: 6F756E74 06B12E73 6770725F 7370696C
 ; ELF:   0120: 6C5F636F 756E7400 A72E7379 6D626F6C
 ; ELF:   0130: A973696D 706C652E 6B64AB2E 76677072
 ; ELF:   0140: 5F636F75 6E7403B1 2E766770 725F7370
@@ -59,7 +59,7 @@
 ; ELF:   01E0: 73696D70 6C655F6E 6F5F6B65 726E6172
 ; ELF:   01F0: 6773BB2E 70726976 6174655F 7365676D
 ; ELF:   0200: 656E745F 66697865 645F7369 7A6500AB
-; ELF:   0210: 2E736770 725F636F 756E740C B12E7367
+; ELF:   0210: 2E736770 725F636F 756E7400 B12E7367
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 8f9b223c361d60..1b12f668e207cd 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -12,9 +12,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V4-NEXT:    s_cselect_b32 s3, s3, 0
@@ -24,7 +22,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8V4-NEXT:    s_cselect_b32 s0, s2, 0
 ; GFX8V4-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v4
@@ -38,9 +35,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V5-NEXT:    s_cselect_b32 s2, s2, 0
@@ -50,7 +45,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8V5-NEXT:    s_cselect_b32 s0, s3, 0
 ; GFX8V5-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v4
@@ -63,10 +57,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V4-NEXT:    s_cselect_b32 s2, s3, 0
@@ -76,7 +69,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V4-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V4-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v4
@@ -89,10 +81,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V5-NEXT:    s_cselect_b32 s2, s3, 0
@@ -102,7 +93,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V5-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V5-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v4
@@ -124,9 +114,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -139,9 +126,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xcc
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -184,9 +168,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -199,9 +180,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xc8
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -285,10 +263,7 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
 ; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -313,9 +288,6 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index ff5c71b65f34bd..4d62d30a38ed34 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %12
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %10
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %9
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() {
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %12
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %11
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %10
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
@@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() {
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %12
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %11
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %10
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index cf6ae3d5b4f684..c68138acc9b2bf 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; VI-LABEL: s_insertelement_v2bf16_0:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -85,9 +82,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; VI-LABEL: s_insertelement_v2bf16_1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -150,9 +144,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -225,9 +216,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -298,9 +286,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -373,9 +358,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -453,14 +435,11 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -551,17 +530,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
@@ -633,17 +609,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
@@ -713,17 +686,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -795,17 +765,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
@@ -881,12 +848,9 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -978,12 +942,9 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
@@ -1097,12 +1058,9 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -1279,14 +1237,11 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -1454,14 +1409,11 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index aafa4a04a00de9..647870f0e08979 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -20,9 +20,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2i16_0:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -72,9 +69,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -91,9 +85,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -149,9 +140,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -172,9 +160,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -237,9 +222,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -255,9 +237,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -314,9 +293,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -336,9 +312,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -409,9 +382,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -434,9 +404,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -508,9 +475,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2i16_1:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -559,9 +523,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -578,9 +539,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -631,9 +589,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2f16_0:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -681,9 +636,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2f16_1:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -732,9 +684,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -753,9 +702,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -814,12 +760,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -836,12 +779,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -898,9 +838,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -919,9 +856,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -979,9 +913,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1000,9 +931,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1059,9 +987,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1080,9 +1005,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1139,9 +1061,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1160,9 +1079,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1219,9 +1135,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1240,9 +1153,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1299,9 +1209,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1320,9 +1227,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1379,9 +1283,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1400,9 +1301,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1465,9 +1363,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1487,9 +1382,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1554,12 +1446,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -1578,12 +1467,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -1649,14 +1535,11 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1679,14 +1562,11 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1757,17 +1637,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
@@ -1779,12 +1656,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1844,17 +1718,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
@@ -1866,12 +1737,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1932,17 +1800,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -1954,12 +1819,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2019,17 +1881,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
@@ -2041,12 +1900,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2107,17 +1963,14 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -2129,12 +1982,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2200,9 +2050,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    flat_load_dword v4, v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -2230,9 +2077,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    flat_load_dword v4, v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -2316,12 +2160,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2345,12 +2186,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2420,12 +2258,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
@@ -2443,12 +2278,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2509,12 +2341,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2532,12 +2361,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2629,12 +2455,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2686,12 +2509,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2836,14 +2656,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2866,12 +2683,9 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s3
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
@@ -2947,14 +2761,12 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2962,7 +2774,6 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
-; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v3, s4, v3, v12
@@ -2976,14 +2787,11 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -3120,14 +2928,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -3220,14 +3025,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[7:10], v[2:3]
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 1a32953305bbc3..496a1c652da251 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -7,7 +7,7 @@ declare void @llvm.trap() #0
 ; DOORBELL-NEXT:     .amdhsa_group_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_private_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_kernarg_size 8
-; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 14
+; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 12
 ; DOORBELL-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; DOORBELL:      .end_amdhsa_kernel
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index c998a007277933..c201f84cac7268 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -23,11 +23,8 @@ define void @function_lds_id(ptr addrspace(1) %out) {
 define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: kernel_lds_id:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s2, s12, 42
+; GCN-NEXT:    s_add_i32 s2, s10, 42
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -75,9 +72,6 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: doesnt_use_it:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x64
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 44ea414dd4b93c..39a3b1c8adc9f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -259,9 +259,6 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -272,9 +269,6 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -290,13 +284,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -305,13 +296,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -324,13 +312,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -339,14 +324,11 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -359,15 +341,12 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
 ; CHECK-SDAG-LABEL: test_readfirstlane_m0:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -375,15 +354,12 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
 ; CHECK-GISEL-LABEL: test_readfirstlane_m0:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -397,31 +373,25 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -435,16 +405,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -452,16 +419,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -475,16 +439,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -492,16 +453,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index ffceac2c912bb6..24a332fa211c15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -179,9 +179,6 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -192,9 +189,6 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -210,13 +204,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -225,13 +216,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -244,13 +232,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -259,14 +244,11 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -280,9 +262,6 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -302,9 +281,6 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -335,9 +311,6 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -359,9 +332,6 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -395,9 +365,6 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -419,9 +386,6 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -455,15 +419,12 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
 ; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -471,15 +432,12 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
 ; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -496,14 +454,11 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
@@ -513,13 +468,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; def v0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -533,17 +485,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v[0:1]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -556,13 +505,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ; def v[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -577,17 +523,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v[0:1]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -600,13 +543,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ; def v[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -621,31 +561,25 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -659,16 +593,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -676,16 +607,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -699,16 +627,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -716,16 +641,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index da40a06c306b98..9d93ca65683c42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -15,9 +15,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -58,9 +55,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -108,9 +102,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
@@ -162,9 +153,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[6:7], 0x10
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
@@ -222,9 +210,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
@@ -276,9 +261,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[6:7], 0x10
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
@@ -336,9 +318,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s2, s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -383,9 +362,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s2, s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -436,9 +412,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -489,9 +462,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -548,14 +518,11 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
@@ -606,14 +573,11 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -669,9 +633,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -733,9 +694,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -808,9 +766,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -878,9 +833,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -957,9 +909,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -968,7 +918,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX802-SDAG-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
@@ -1031,9 +980,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -1043,7 +990,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1118,18 +1064,15 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-SDAG-NEXT:    ;;#ASMSTART
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX802-SDAG-NEXT:    ;;#ASMEND
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    s_mov_b32 s4, m0
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
@@ -1176,18 +1119,15 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-GISEL-NEXT:    ;;#ASMSTART
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX802-GISEL-NEXT:    ;;#ASMEND
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    s_mov_b32 s4, m0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
@@ -1238,9 +1178,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s2, s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -1285,9 +1222,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s2, s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1337,9 +1271,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-SDAG-LABEL: test_writelane_imm_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -1385,9 +1316,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-GISEL-LABEL: test_writelane_imm_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1439,9 +1367,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG-LABEL: test_writelane_imm_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -1487,9 +1412,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL-LABEL: test_writelane_imm_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1542,9 +1464,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
@@ -1584,9 +1503,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
@@ -1632,13 +1548,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
@@ -1683,14 +1596,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1739,13 +1649,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
@@ -1790,14 +1697,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1844,10 +1748,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, 42
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -1881,10 +1782,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, 42
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1924,14 +1822,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 42
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
@@ -1972,14 +1867,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2025,14 +1917,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
@@ -2073,14 +1962,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 59cc6dfac12005..7202ab8b314669 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-HSA-LABEL: constant_load_f64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -96,9 +93,6 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
 ; GFX7-HSA-LABEL: constant_load_2v4f64:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[20:21], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 5d69aa7d679bed..2ee1c60b4bbf2d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -26,9 +26,6 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: constant_load_i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -111,9 +108,6 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v2i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -187,9 +181,6 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v3i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 4
@@ -296,9 +287,6 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v4i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -375,9 +363,6 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v8i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -464,9 +449,6 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GCN-HSA-LABEL: constant_load_v16i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -609,9 +591,6 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GCN-HSA-LABEL: constant_load_v16i16_align2:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
@@ -825,9 +804,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_zextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -905,9 +881,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_sextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -986,9 +959,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1066,9 +1036,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1145,9 +1112,6 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1235,9 +1199,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1329,9 +1290,6 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1431,9 +1389,6 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1538,9 +1493,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1649,9 +1601,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1771,9 +1720,6 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1934,9 +1880,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2116,9 +2059,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2383,9 +2323,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2694,9 +2631,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3178,9 +3112,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3752,9 +3683,6 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4678,9 +4606,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5469,9 +5394,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_zextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5560,9 +5482,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_sextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5650,9 +5569,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5736,9 +5652,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5825,15 +5738,12 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s0, s2, 16
 ; GCN-HSA-NEXT:    s_and_b32 s1, s2, 0xffff
@@ -5924,9 +5834,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -6032,13 +5939,10 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s4, s3, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s5, s2, 16
@@ -6172,9 +6076,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6333,13 +6234,10 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s8, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
@@ -6556,9 +6454,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6822,13 +6717,10 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s7, 16
@@ -7212,9 +7104,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7693,13 +7582,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s19, s1, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s20, s3, 16
@@ -8424,9 +8310,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 6283f6bb3c5e38..4ab55164e09992 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -23,9 +23,6 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-HSA-LABEL: constant_load_i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -108,9 +105,6 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v2i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -200,9 +194,6 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -299,9 +290,6 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v4i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -402,9 +390,6 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-HSA-LABEL: constant_load_v8i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -541,9 +526,6 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-HSA-LABEL: constant_load_v9i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s12, s[10:11], 0x8
@@ -707,9 +689,6 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX7-HSA-LABEL: constant_load_v10i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[12:13], s[10:11], 0x8
@@ -882,9 +861,6 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v11i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[2:3], 0x8
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
@@ -1062,9 +1038,6 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX7-HSA-LABEL: constant_load_v12i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x8
@@ -1248,9 +1221,6 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v16i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s18, s16, 48
@@ -1439,9 +1409,6 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1528,9 +1495,6 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1623,9 +1587,6 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1712,9 +1673,6 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1809,15 +1767,12 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s3
@@ -1912,9 +1867,6 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -2031,16 +1983,13 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GFX7-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s6
@@ -2169,9 +2118,6 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2336,10 +2282,8 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 48
@@ -2347,7 +2291,6 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s11
@@ -2548,9 +2491,6 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -2849,9 +2789,6 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3308,16 +3245,13 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s18, s16, 0x70
 ; GFX7-HSA-NEXT:    s_addc_u32 s19, s17, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s19
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s14
@@ -3747,9 +3681,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4608,17 +4539,14 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s34, s36, 0xf0
 ; GFX7-HSA-NEXT:    s_addc_u32 s35, s37, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s34
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s35
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s30
@@ -5232,9 +5160,6 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 45f0af8d423b67..46c7c2b08cd64b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-LABEL: constant_load_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -100,9 +97,6 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v2i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s0
@@ -189,9 +183,6 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v3i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
@@ -308,9 +299,6 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-LABEL: constant_load_v4i64:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -442,9 +430,6 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v8i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-NEXT:    s_add_u32 s18, s16, 48
@@ -664,9 +649,6 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ; GFX7-LABEL: constant_load_v16i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 43b79973187a6b..ce17c81a24dd50 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -27,9 +27,6 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
 ; GFX7-HSA-LABEL: constant_load_i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -117,9 +114,6 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v2i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -205,9 +199,6 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v3i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -320,9 +311,6 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v4i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -394,9 +382,6 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v8i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -473,9 +458,6 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v16i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -559,9 +541,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -639,9 +618,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -720,9 +696,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -800,9 +773,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -884,9 +854,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -988,9 +955,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1090,9 +1054,6 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1196,9 +1157,6 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1302,9 +1260,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1411,9 +1366,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1533,9 +1485,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1697,9 +1646,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1884,9 +1830,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2155,9 +2098,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2474,9 +2414,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2961,9 +2898,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3549,9 +3483,6 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4471,9 +4402,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5283,9 +5211,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5370,9 +5295,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5460,9 +5382,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5545,9 +5464,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5638,9 +5554,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5750,9 +5663,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5868,13 +5778,10 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_bfe_u32 s4, s2, 0x80008
 ; GFX7-HSA-NEXT:    s_lshr_b32 s3, s2, 24
@@ -6011,9 +5918,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6175,13 +6079,10 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s4, s2, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s5, s3, 24
@@ -6402,9 +6303,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6676,13 +6574,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s8, s5, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s9, s4, 24
@@ -7073,9 +6968,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7567,13 +7459,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s12, s4, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s13, s5, 24
@@ -8316,9 +8205,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -9093,9 +8979,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9182,9 +9065,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9273,9 +9153,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9362,9 +9239,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9456,9 +9330,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9560,9 +9431,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9677,9 +9545,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -9816,9 +9681,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -9977,9 +9839,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -10184,9 +10043,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -10436,9 +10292,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10797,9 +10650,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11249,9 +11099,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16:
 ; GFX7-HSA:       ; %bb.0:
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -11916,9 +11763,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16:
 ; GFX7-HSA:       ; %bb.0:
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 6a973d0adeffa5..e0c2d00891250b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -28,9 +28,6 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
 ; GCN-HSA-LABEL: global_load_i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -136,9 +133,6 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v2i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -225,9 +219,6 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v3i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -359,9 +350,6 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v4i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -447,9 +435,6 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v8i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -538,9 +523,6 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
 ; GCN-HSA-LABEL: global_load_v16i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
@@ -691,9 +673,6 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; GCN-HSA-LABEL: global_load_v16i16_align2:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
@@ -843,9 +822,6 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_zextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -931,9 +907,6 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_sextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1022,9 +995,6 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1110,9 +1080,6 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1203,9 +1170,6 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1305,9 +1269,6 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1409,9 +1370,6 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1522,9 +1480,6 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1642,9 +1597,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1760,9 +1712,6 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1885,9 +1834,6 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2037,9 +1983,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2204,9 +2147,6 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2443,9 +2383,6 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2717,9 +2654,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -3131,9 +3065,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3653,9 +3584,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -4460,9 +4388,6 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5228,9 +5153,6 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_zextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5328,9 +5250,6 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_sextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5426,9 +5345,6 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5521,9 +5437,6 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5622,9 +5535,6 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5734,9 +5644,6 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5855,9 +5762,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6003,9 +5907,6 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6166,10 +6067,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6184,11 +6085,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v4
@@ -6388,9 +6286,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6641,10 +6536,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6661,10 +6556,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v8
@@ -7024,9 +6916,6 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -7498,9 +7387,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -8203,9 +8089,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 8322eee8264957..4d7f1a9663c3dc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -27,9 +27,6 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
 ; GCNX3-HSA-LABEL: global_load_i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -109,9 +106,6 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v2i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -192,9 +186,6 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v3i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -279,9 +270,6 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v4i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -364,9 +352,6 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v8i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
@@ -473,9 +458,6 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v9i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -607,9 +589,6 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v10i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -740,9 +719,6 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v11i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -878,9 +854,6 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v12i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -1014,9 +987,6 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v16i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -1164,9 +1134,6 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
 ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1250,9 +1217,6 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
 ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1337,9 +1301,6 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1423,9 +1384,6 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1513,9 +1471,6 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1613,9 +1568,6 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1718,10 +1670,8 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1729,7 +1679,6 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v2
@@ -1847,9 +1796,6 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1991,10 +1937,8 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2009,7 +1953,6 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
@@ -2187,9 +2130,6 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2426,9 +2366,6 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2790,10 +2727,8 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2827,7 +2762,6 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s1
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
@@ -3184,9 +3118,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3658,7 +3589,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-GFX900-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s15
+; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-GFX900-HSA-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-GFX900-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
@@ -3978,9 +3909,6 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -4505,9 +4433,6 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v32i32:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 1a6fa3c518ca7a..4dfc773d615e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -13,8 +13,7 @@
 ; GCN: s_cselect_b32
 
 ; GCN-NOT: load_dword
-; GCN: flat_load_dword
-; GCN: flat_load_dword
+; GCN: flat_load_dwordx2
 ; GCN-NOT: load_dword
 
 ; GCN: flat_store_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 245a2775d9f2fb..e876a8d9dda692 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
 
 ; GCN-LABEL: {{^}}get_global_id_0:
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
-; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]]
+; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0
 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 {
   %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index e8632871f56ea4..92536c2078514a 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX9-NEXT:    s_mul_i32 s12, s12, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s12
+; GFX9-NEXT:    s_mul_i32 s10, s10, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s10
 ; GFX9-NEXT:    v_add_u32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
@@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX10-NEXT:    s_mul_i32 s12, s12, s4
-; GFX10-NEXT:    v_add3_u32 v0, s5, s12, v0
+; GFX10-NEXT:    s_mul_i32 s10, s10, s4
+; GFX10-NEXT:    v_add3_u32 v0, s5, s10, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0348737a41a30f..0a76e169e9c385 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -9,8 +9,6 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
 ; CHECK-LABEL: memcpy_p0_p0_minsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
@@ -188,7 +186,7 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s15
+; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
 ; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
@@ -568,9 +566,7 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s15
+; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
@@ -985,8 +981,6 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
 ; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
 ; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
@@ -1261,8 +1255,6 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
 ; CHECK-LABEL: memcpy_p0_p0_optsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
@@ -1440,7 +1432,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s15
+; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
 ; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
@@ -1820,9 +1812,7 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s15
+; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
@@ -2237,8 +2227,6 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
 ; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
 ; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 7868fa9a7ce4c9..1c33d8a19890db 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_agent_unordered_load(
 ; GFX7-LABEL: flat_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -217,9 +202,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_load(
 ; GFX7-LABEL: flat_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -419,9 +389,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_load(
 ; GFX7-LABEL: flat_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -438,10 +405,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -460,10 +423,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -497,8 +456,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -514,8 +471,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -636,9 +591,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_load(
 ; GFX7-LABEL: flat_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -656,10 +608,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -680,10 +628,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -720,8 +664,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -738,8 +680,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -879,9 +819,6 @@ entry:
 define amdgpu_kernel void @flat_agent_unordered_store(
 ; GFX7-LABEL: flat_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -893,10 +830,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -909,10 +842,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -936,8 +865,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -949,8 +876,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1034,9 +959,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_store(
 ; GFX7-LABEL: flat_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1048,10 +970,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1064,10 +982,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1091,8 +1005,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1104,8 +1016,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1189,9 +1099,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_store(
 ; GFX7-LABEL: flat_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1204,10 +1111,6 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1222,10 +1125,6 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1252,8 +1151,6 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1266,8 +1163,6 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1370,9 +1265,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_store(
 ; GFX7-LABEL: flat_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1385,10 +1277,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1403,10 +1291,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1433,8 +1317,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1447,8 +1329,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1551,9 +1431,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1566,10 +1443,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1583,10 +1456,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1612,8 +1481,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1626,8 +1493,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1720,9 +1585,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1737,10 +1599,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1758,10 +1616,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1792,8 +1646,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1808,8 +1660,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1920,9 +1770,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1936,10 +1783,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1955,10 +1798,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1987,8 +1826,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2002,8 +1839,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2115,9 +1950,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2133,10 +1965,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2156,10 +1984,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2193,8 +2017,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2210,8 +2032,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2341,9 +2161,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2359,10 +2176,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2382,10 +2195,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2419,8 +2228,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2436,8 +2243,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2567,9 +2372,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2586,10 +2388,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2608,10 +2406,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2645,8 +2439,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2662,8 +2454,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2782,9 +2572,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2802,10 +2589,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2826,10 +2609,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2866,8 +2645,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2884,8 +2661,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3027,9 +2802,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3047,10 +2819,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3071,10 +2839,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3111,8 +2875,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3129,8 +2891,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3272,9 +3032,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3300,10 +3057,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3329,10 +3082,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3383,8 +3132,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3400,8 +3147,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3520,9 +3265,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3550,10 +3292,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3583,10 +3321,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3642,8 +3376,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3661,8 +3393,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3799,9 +3529,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3828,10 +3555,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3859,10 +3582,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3916,8 +3635,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3934,8 +3651,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4073,9 +3788,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4104,10 +3816,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4139,10 +3847,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4201,8 +3905,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4221,8 +3923,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4378,9 +4078,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4409,10 +4106,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4444,10 +4137,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4506,8 +4195,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4526,8 +4213,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4683,9 +4368,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4713,10 +4395,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4746,10 +4424,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4805,8 +4479,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4824,8 +4496,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4962,9 +4632,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4992,10 +4659,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5025,10 +4688,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5084,8 +4743,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5103,8 +4760,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5241,9 +4896,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5272,10 +4924,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5307,10 +4955,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5369,8 +5013,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5389,8 +5031,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5546,9 +5186,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5577,10 +5214,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5612,10 +5245,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5674,8 +5303,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5694,8 +5321,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5851,9 +5476,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5882,10 +5504,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5917,10 +5535,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5979,8 +5593,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5999,8 +5611,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6156,9 +5766,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6187,10 +5794,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6222,10 +5825,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6284,8 +5883,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6304,8 +5901,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6461,9 +6056,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6492,10 +6084,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6527,10 +6115,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6589,8 +6173,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6609,8 +6191,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6766,9 +6346,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6797,10 +6374,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6832,10 +6405,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6894,8 +6463,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6914,8 +6481,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7071,9 +6636,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7102,10 +6664,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7137,10 +6695,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7199,8 +6753,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7219,8 +6771,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7376,9 +6926,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7407,10 +6954,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7442,10 +6985,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7504,8 +7043,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7524,8 +7061,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7681,9 +7216,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7712,10 +7244,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7744,10 +7272,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7804,8 +7328,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7824,8 +7346,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7971,9 +7491,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8003,10 +7520,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8037,10 +7550,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8099,8 +7608,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8120,8 +7627,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8276,9 +7781,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8308,10 +7810,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8342,10 +7840,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8405,8 +7899,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8426,8 +7918,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8592,9 +8082,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8625,10 +8112,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8661,10 +8144,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8726,8 +8205,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8748,8 +8225,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8927,9 +8402,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8960,10 +8432,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8996,10 +8464,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9061,8 +8525,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9083,8 +8545,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9262,9 +8722,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9294,10 +8751,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9328,10 +8781,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9390,8 +8839,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9411,8 +8858,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9571,9 +9016,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9603,10 +9045,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9637,10 +9075,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9699,8 +9133,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9720,8 +9152,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9876,9 +9306,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9909,10 +9336,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9945,10 +9368,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10010,8 +9429,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10032,8 +9449,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10211,9 +9626,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10244,10 +9656,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10280,10 +9688,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10345,8 +9749,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10367,8 +9769,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10546,9 +9946,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10579,10 +9976,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10615,10 +10008,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10680,8 +10069,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10702,8 +10089,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10881,9 +10266,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10914,10 +10296,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10950,10 +10328,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11015,8 +10389,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11037,8 +10409,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11216,9 +10586,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11249,10 +10616,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11285,10 +10648,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11350,8 +10709,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11372,8 +10729,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11547,9 +10902,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11580,10 +10932,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11616,10 +10964,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11681,8 +11025,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11703,8 +11045,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11882,9 +11222,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11915,10 +11252,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11951,10 +11284,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12016,8 +11345,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12038,8 +11365,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12217,9 +11542,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -12250,10 +11572,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12286,10 +11604,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12351,8 +11665,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12373,8 +11685,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12552,9 +11862,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ; GFX7-LABEL: flat_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12570,10 +11877,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12590,10 +11893,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12625,8 +11924,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12641,8 +11938,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12754,9 +12049,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12772,10 +12064,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12792,10 +12080,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12827,8 +12111,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12843,8 +12125,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12956,9 +12236,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ; GFX7-LABEL: flat_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12976,10 +12253,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12999,10 +12272,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13038,8 +12307,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13056,8 +12323,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13183,9 +12448,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -13204,10 +12466,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13229,10 +12487,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13271,8 +12525,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13290,8 +12542,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13436,9 +12686,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ; GFX7-LABEL: flat_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13450,10 +12697,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13466,10 +12709,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13493,8 +12732,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13506,8 +12743,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13591,9 +12826,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13605,10 +12837,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13621,10 +12849,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13648,8 +12872,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13661,8 +12883,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13746,9 +12966,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_store(
 ; GFX7-LABEL: flat_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13761,10 +12978,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13779,10 +12992,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13809,8 +13018,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13823,8 +13030,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13927,9 +13132,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13942,10 +13144,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13960,10 +13158,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13990,8 +13184,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14004,8 +13196,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14108,9 +13298,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14123,10 +13310,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14140,10 +13323,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14169,8 +13348,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14183,8 +13360,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14277,9 +13452,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14294,10 +13466,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14314,10 +13482,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14347,8 +13511,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14363,8 +13525,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14473,9 +13633,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14489,10 +13646,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14508,10 +13661,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14540,8 +13689,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14555,8 +13702,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14668,9 +13813,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14686,10 +13828,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14708,10 +13846,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14744,8 +13878,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14761,8 +13893,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14890,9 +14020,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14908,10 +14035,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14930,10 +14053,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14966,8 +14085,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14983,8 +14100,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -15112,9 +14227,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15132,10 +14244,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15155,10 +14263,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15194,8 +14298,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15212,8 +14314,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15337,9 +14437,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15358,10 +14455,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15383,10 +14476,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15425,8 +14514,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15444,8 +14531,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15592,9 +14677,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15613,10 +14695,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15638,10 +14716,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15680,8 +14754,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15699,8 +14771,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15847,9 +14917,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15875,10 +14942,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15904,10 +14967,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15958,8 +15017,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15975,8 +15032,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16095,9 +15150,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16125,10 +15177,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16157,10 +15205,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16215,8 +15259,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16234,8 +15276,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16370,9 +15410,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16399,10 +15436,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16430,10 +15463,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16487,8 +15516,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16505,8 +15532,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16644,9 +15669,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16675,10 +15697,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16709,10 +15727,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16770,8 +15784,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16790,8 +15802,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16945,9 +15955,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16976,10 +15983,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17010,10 +16013,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17071,8 +16070,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17091,8 +16088,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17246,9 +16241,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17276,10 +16268,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17308,10 +16296,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17366,8 +16350,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17385,8 +16367,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17521,9 +16501,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17551,10 +16528,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17583,10 +16556,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17641,8 +16610,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17660,8 +16627,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17796,9 +16761,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17827,10 +16789,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17861,10 +16819,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17922,8 +16876,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17942,8 +16894,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18097,9 +17047,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18128,10 +17075,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18162,10 +17105,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18223,8 +17162,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18243,8 +17180,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18398,9 +17333,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18429,10 +17361,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18463,10 +17391,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18524,8 +17448,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18544,8 +17466,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18699,9 +17619,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18730,10 +17647,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18764,10 +17677,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18825,8 +17734,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18845,8 +17752,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19000,9 +17905,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19031,10 +17933,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19065,10 +17963,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19126,8 +18020,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19146,8 +18038,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19301,9 +18191,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19332,10 +18219,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19366,10 +18249,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19427,8 +18306,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19447,8 +18324,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19602,9 +18477,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19633,10 +18505,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19667,10 +18535,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19728,8 +18592,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19748,8 +18610,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19903,9 +18763,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19934,10 +18791,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19968,10 +18821,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -20029,8 +18878,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20049,8 +18896,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20204,9 +19049,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20235,10 +19077,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20267,10 +19105,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20327,8 +19161,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20347,8 +19179,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20494,9 +19324,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20527,10 +19354,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20562,10 +19385,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20626,8 +19445,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20648,8 +19465,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20809,9 +19624,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20841,10 +19653,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20875,10 +19683,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20938,8 +19742,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20959,8 +19761,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21125,9 +19925,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21159,10 +19956,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21196,10 +19989,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21263,8 +20052,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21286,8 +20073,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21470,9 +20255,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21504,10 +20286,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21541,10 +20319,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21608,8 +20382,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21631,8 +20403,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21815,9 +20585,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21848,10 +20615,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21883,10 +20646,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21947,8 +20706,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21969,8 +20726,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22134,9 +20889,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22167,10 +20919,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22202,10 +20950,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22266,8 +21010,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22288,8 +21030,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22449,9 +21189,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22483,10 +21220,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22520,10 +21253,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22587,8 +21316,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22610,8 +21337,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22794,9 +21519,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22828,10 +21550,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22865,10 +21583,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22932,8 +21646,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22955,8 +21667,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23139,9 +21849,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23173,10 +21880,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23210,10 +21913,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23277,8 +21976,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23300,8 +21997,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23484,9 +22179,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23518,10 +22210,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23555,10 +22243,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23622,8 +22306,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23645,8 +22327,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23829,9 +22509,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23863,10 +22540,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23900,10 +22573,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23967,8 +22636,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23990,8 +22657,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24170,9 +22835,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24204,10 +22866,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24241,10 +22899,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24308,8 +22962,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24331,8 +22983,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24515,9 +23165,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24549,10 +23196,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24586,10 +23229,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24653,8 +23292,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24676,8 +23313,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24860,9 +23495,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24894,10 +23526,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24931,10 +23559,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24998,8 +23622,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -25021,8 +23643,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index f189562bafe5f7..b2340caa2933f9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -217,9 +202,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -248,10 +230,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -279,10 +257,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -338,8 +312,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -369,8 +341,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -581,9 +551,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -599,10 +566,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -619,10 +582,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -654,8 +613,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -670,8 +627,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -783,9 +738,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -814,10 +766,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -846,10 +794,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -906,8 +850,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -938,8 +880,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1151,9 +1091,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ; GFX7-LABEL: flat_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -1170,10 +1107,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1191,10 +1124,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1228,8 +1157,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1245,8 +1172,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 089d2a69facc17..304c80d7bb24d4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_singlethread_unordered_load(
 ; GFX7-LABEL: flat_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -217,9 +202,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -419,9 +389,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_load(
 ; GFX7-LABEL: flat_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -437,10 +404,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -457,10 +420,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -492,8 +451,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -508,8 +465,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -621,9 +576,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -639,10 +591,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -659,10 +607,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -694,8 +638,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -710,8 +652,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -823,9 +763,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_unordered_store(
 ; GFX7-LABEL: flat_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -837,10 +774,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -853,10 +786,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -880,8 +809,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -893,8 +820,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -978,9 +903,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -992,10 +914,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1008,10 +926,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1035,8 +949,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1048,8 +960,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1133,9 +1043,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_store(
 ; GFX7-LABEL: flat_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1147,10 +1054,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1163,10 +1066,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1190,8 +1089,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1203,8 +1100,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1288,9 +1183,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1302,10 +1194,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1318,10 +1206,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1345,8 +1229,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1358,8 +1240,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1443,9 +1323,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1458,10 +1335,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1475,10 +1348,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1504,8 +1373,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1518,8 +1385,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1612,9 +1477,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1627,10 +1489,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1644,10 +1502,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1673,8 +1527,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1687,8 +1539,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1781,9 +1631,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1796,10 +1643,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1813,10 +1656,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1842,8 +1681,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1856,8 +1693,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1950,9 +1785,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1965,10 +1797,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1982,10 +1810,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2011,8 +1835,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2025,8 +1847,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2119,9 +1939,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2134,10 +1951,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2151,10 +1964,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2180,8 +1989,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2194,8 +2001,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2288,9 +2093,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2306,10 +2108,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2326,10 +2124,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2361,8 +2155,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2377,8 +2169,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2488,9 +2278,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2506,10 +2293,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2526,10 +2309,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2561,8 +2340,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2577,8 +2354,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2688,9 +2463,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2706,10 +2478,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2726,10 +2494,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2761,8 +2525,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2777,8 +2539,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2888,9 +2648,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2916,10 +2673,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2945,10 +2698,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2999,8 +2748,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3016,8 +2763,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3136,9 +2881,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3164,10 +2906,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3193,10 +2931,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3247,8 +2981,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3264,8 +2996,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3384,9 +3114,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3412,10 +3139,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3441,10 +3164,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3495,8 +3214,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3512,8 +3229,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3632,9 +3347,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3660,10 +3372,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3689,10 +3397,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3743,8 +3447,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3760,8 +3462,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3880,9 +3580,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3908,10 +3605,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3937,10 +3630,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3991,8 +3680,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4008,8 +3695,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4128,9 +3813,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4156,10 +3838,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4185,10 +3863,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4239,8 +3913,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4256,8 +3928,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4376,9 +4046,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4404,10 +4071,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4433,10 +4096,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4487,8 +4146,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4504,8 +4161,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4624,9 +4279,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4652,10 +4304,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4681,10 +4329,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4735,8 +4379,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4752,8 +4394,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4872,9 +4512,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4900,10 +4537,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4929,10 +4562,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4983,8 +4612,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5000,8 +4627,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5120,9 +4745,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5148,10 +4770,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5177,10 +4795,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5231,8 +4845,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5248,8 +4860,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5368,9 +4978,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5396,10 +5003,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5425,10 +5028,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5479,8 +5078,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5496,8 +5093,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5616,9 +5211,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5644,10 +5236,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5673,10 +5261,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5727,8 +5311,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5744,8 +5326,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5864,9 +5444,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5892,10 +5469,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5921,10 +5494,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5975,8 +5544,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5992,8 +5559,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6112,9 +5677,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6140,10 +5702,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6169,10 +5727,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6223,8 +5777,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6240,8 +5792,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6360,9 +5910,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6388,10 +5935,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6417,10 +5960,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6471,8 +6010,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6488,8 +6025,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6608,9 +6143,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6639,10 +6171,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6671,10 +6199,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6731,8 +6255,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6751,8 +6273,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6898,9 +6418,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6929,10 +6446,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6961,10 +6474,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7021,8 +6530,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7041,8 +6548,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7188,9 +6693,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7219,10 +6721,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7251,10 +6749,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7311,8 +6805,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7331,8 +6823,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7478,9 +6968,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7509,10 +6996,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7541,10 +7024,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7601,8 +7080,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7621,8 +7098,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7768,9 +7243,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7799,10 +7271,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7831,10 +7299,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7891,8 +7355,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7911,8 +7373,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8058,9 +7518,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8089,10 +7546,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8121,10 +7574,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8181,8 +7630,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8201,8 +7648,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8348,9 +7793,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8379,10 +7821,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8411,10 +7849,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8471,8 +7905,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8491,8 +7923,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8638,9 +8068,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8669,10 +8096,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8701,10 +8124,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8761,8 +8180,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8781,8 +8198,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8928,9 +8343,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8959,10 +8371,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8991,10 +8399,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9051,8 +8455,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9071,8 +8473,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9218,9 +8618,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9249,10 +8646,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9281,10 +8674,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9341,8 +8730,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9361,8 +8748,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9508,9 +8893,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9539,10 +8921,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9571,10 +8949,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9631,8 +9005,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9651,8 +9023,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9798,9 +9168,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9829,10 +9196,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9861,10 +9224,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9921,8 +9280,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9941,8 +9298,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10088,9 +9443,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10119,10 +9471,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10151,10 +9499,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10211,8 +9555,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10231,8 +9573,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10378,9 +9718,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10409,10 +9746,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10441,10 +9774,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10501,8 +9830,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10521,8 +9848,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10668,9 +9993,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10699,10 +10021,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10731,10 +10049,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10791,8 +10105,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10811,8 +10123,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10958,9 +10268,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10976,10 +10283,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10996,10 +10299,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11031,8 +10330,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11047,8 +10344,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11160,9 +10455,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11178,10 +10470,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11198,10 +10486,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11233,8 +10517,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11249,8 +10531,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11362,9 +10642,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11380,10 +10657,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11400,10 +10673,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11435,8 +10704,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11451,8 +10718,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11564,9 +10829,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11582,10 +10844,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11602,10 +10860,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11637,8 +10891,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11653,8 +10905,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11766,9 +11016,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11780,10 +11027,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11796,10 +11039,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11823,8 +11062,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11836,8 +11073,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11921,9 +11156,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11935,10 +11167,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11951,10 +11179,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11978,8 +11202,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11991,8 +11213,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12076,9 +11296,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ; GFX7-LABEL: flat_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12090,10 +11307,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12106,10 +11319,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12133,8 +11342,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12146,8 +11353,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12231,9 +11436,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12245,10 +11447,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12261,10 +11459,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12288,8 +11482,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12301,8 +11493,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12386,9 +11576,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12401,10 +11588,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12418,10 +11601,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12447,8 +11626,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12461,8 +11638,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12555,9 +11730,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12570,10 +11742,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12587,10 +11755,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12616,8 +11780,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12630,8 +11792,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12724,9 +11884,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12739,10 +11896,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12756,10 +11909,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12785,8 +11934,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12799,8 +11946,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12893,9 +12038,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12908,10 +12050,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12925,10 +12063,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12954,8 +12088,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12968,8 +12100,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13062,9 +12192,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13077,10 +12204,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13094,10 +12217,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13123,8 +12242,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13137,8 +12254,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13231,9 +12346,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13249,10 +12361,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13269,10 +12377,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13304,8 +12408,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13320,8 +12422,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13431,9 +12531,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13449,10 +12546,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13469,10 +12562,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13504,8 +12593,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13520,8 +12607,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13631,9 +12716,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13649,10 +12731,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13669,10 +12747,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13704,8 +12778,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13720,8 +12792,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13831,9 +12901,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13859,10 +12926,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13888,10 +12951,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13942,8 +13001,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13959,8 +13016,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14079,9 +13134,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14107,10 +13159,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14136,10 +13184,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14190,8 +13234,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14207,8 +13249,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14327,9 +13367,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14355,10 +13392,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14384,10 +13417,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14438,8 +13467,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14455,8 +13482,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14575,9 +13600,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14603,10 +13625,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14632,10 +13650,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14686,8 +13700,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14703,8 +13715,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14823,9 +13833,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14851,10 +13858,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14880,10 +13883,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14934,8 +13933,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14951,8 +13948,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15071,9 +14066,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15099,10 +14091,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15128,10 +14116,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15182,8 +14166,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15199,8 +14181,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15319,9 +14299,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15347,10 +14324,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15376,10 +14349,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15430,8 +14399,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15447,8 +14414,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15567,9 +14532,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15595,10 +14557,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15624,10 +14582,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15678,8 +14632,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15695,8 +14647,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15815,9 +14765,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15843,10 +14790,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15872,10 +14815,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15926,8 +14865,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15943,8 +14880,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16063,9 +14998,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16091,10 +15023,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16120,10 +15048,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16174,8 +15098,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16191,8 +15113,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16311,9 +15231,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16339,10 +15256,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16368,10 +15281,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16422,8 +15331,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16439,8 +15346,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16559,9 +15464,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16587,10 +15489,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16616,10 +15514,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16670,8 +15564,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16687,8 +15579,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16807,9 +15697,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16835,10 +15722,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16864,10 +15747,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16918,8 +15797,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16935,8 +15812,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17055,9 +15930,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17083,10 +15955,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17112,10 +15980,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17166,8 +16030,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17183,8 +16045,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17303,9 +16163,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17331,10 +16188,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17360,10 +16213,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17414,8 +16263,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17431,8 +16278,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17551,9 +16396,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17582,10 +16424,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17614,10 +16452,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17674,8 +16508,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17694,8 +16526,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17841,9 +16671,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17872,10 +16699,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17904,10 +16727,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17964,8 +16783,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17984,8 +16801,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18131,9 +16946,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18162,10 +16974,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18194,10 +17002,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18254,8 +17058,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18274,8 +17076,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18421,9 +17221,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18452,10 +17249,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18484,10 +17277,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18544,8 +17333,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18564,8 +17351,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18711,9 +17496,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18742,10 +17524,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18774,10 +17552,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18834,8 +17608,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18854,8 +17626,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19001,9 +17771,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19032,10 +17799,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19064,10 +17827,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19124,8 +17883,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19144,8 +17901,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19291,9 +18046,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19322,10 +18074,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19354,10 +18102,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19414,8 +18158,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19434,8 +18176,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19581,9 +18321,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19612,10 +18349,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19644,10 +18377,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19704,8 +18433,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19724,8 +18451,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19871,9 +18596,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19902,10 +18624,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19934,10 +18652,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19994,8 +18708,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20014,8 +18726,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20161,9 +18871,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20192,10 +18899,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20224,10 +18927,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20284,8 +18983,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20304,8 +19001,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20451,9 +19146,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20482,10 +19174,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20514,10 +19202,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20574,8 +19258,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20594,8 +19276,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20741,9 +19421,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20772,10 +19449,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20804,10 +19477,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20864,8 +19533,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20884,8 +19551,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21031,9 +19696,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21062,10 +19724,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21094,10 +19752,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21154,8 +19808,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21174,8 +19826,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21321,9 +19971,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21352,10 +19999,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21384,10 +20027,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21444,8 +20083,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21464,8 +20101,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21611,9 +20246,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21642,10 +20274,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21674,10 +20302,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21734,8 +20358,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21754,8 +20376,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 69c03ba6a39796..038b58deb0cf19 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_system_unordered_load(
 ; GFX7-LABEL: flat_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -217,9 +202,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_load(
 ; GFX7-LABEL: flat_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -419,9 +389,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_load(
 ; GFX7-LABEL: flat_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -438,10 +405,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -460,10 +423,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -497,8 +456,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -515,8 +472,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -638,9 +593,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_load(
 ; GFX7-LABEL: flat_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -658,10 +610,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -682,10 +630,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -722,8 +666,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -741,8 +683,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -883,9 +823,6 @@ entry:
 define amdgpu_kernel void @flat_system_unordered_store(
 ; GFX7-LABEL: flat_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -897,10 +834,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -913,10 +846,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -940,8 +869,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -953,8 +880,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1038,9 +963,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_store(
 ; GFX7-LABEL: flat_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1052,10 +974,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1068,10 +986,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1095,8 +1009,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1108,8 +1020,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1193,9 +1103,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_store(
 ; GFX7-LABEL: flat_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1208,10 +1115,6 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1226,10 +1129,6 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1256,8 +1155,6 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1271,8 +1168,6 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1376,9 +1271,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_store(
 ; GFX7-LABEL: flat_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1391,10 +1283,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1409,10 +1297,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1439,8 +1323,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1454,8 +1336,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1559,9 +1439,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1574,10 +1451,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1591,10 +1464,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1620,8 +1489,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1634,8 +1501,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1728,9 +1593,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1745,10 +1607,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1766,10 +1624,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1800,8 +1654,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1817,8 +1669,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1930,9 +1780,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_atomicrmw(
 ; GFX7-LABEL: flat_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1946,10 +1793,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1965,10 +1808,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1997,8 +1836,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2013,8 +1850,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2127,9 +1962,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2145,10 +1977,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2168,10 +1996,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2205,8 +2029,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2224,8 +2046,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2357,9 +2177,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2375,10 +2192,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2398,10 +2211,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2435,8 +2244,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2454,8 +2261,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2587,9 +2392,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2606,10 +2408,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2628,10 +2426,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2665,8 +2459,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2683,8 +2475,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2804,9 +2594,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2824,10 +2611,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2848,10 +2631,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2888,8 +2667,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2908,8 +2685,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3053,9 +2828,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3073,10 +2845,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3097,10 +2865,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3137,8 +2901,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3157,8 +2919,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3302,9 +3062,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3330,10 +3087,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3359,10 +3112,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3413,8 +3162,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3430,8 +3177,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3550,9 +3295,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3580,10 +3322,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3613,10 +3351,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3672,8 +3406,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3692,8 +3424,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3831,9 +3561,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3860,10 +3587,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3891,10 +3614,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3948,8 +3667,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3967,8 +3684,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4107,9 +3822,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4138,10 +3850,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4173,10 +3881,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4235,8 +3939,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4257,8 +3959,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4416,9 +4116,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4447,10 +4144,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4482,10 +4175,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4544,8 +4233,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4566,8 +4253,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4725,9 +4410,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4755,10 +4437,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4788,10 +4466,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4847,8 +4521,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4867,8 +4539,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5006,9 +4676,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5036,10 +4703,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5069,10 +4732,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5128,8 +4787,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5148,8 +4805,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5287,9 +4942,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5318,10 +4970,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5353,10 +5001,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5415,8 +5059,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5437,8 +5079,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5596,9 +5236,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5627,10 +5264,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5662,10 +5295,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5724,8 +5353,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5746,8 +5373,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5905,9 +5530,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5936,10 +5558,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5971,10 +5589,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6033,8 +5647,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6055,8 +5667,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6214,9 +5824,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6245,10 +5852,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6280,10 +5883,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6342,8 +5941,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6364,8 +5961,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6523,9 +6118,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6554,10 +6146,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6589,10 +6177,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6651,8 +6235,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6673,8 +6255,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6832,9 +6412,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6863,10 +6440,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6898,10 +6471,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6960,8 +6529,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6982,8 +6549,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7141,9 +6706,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7172,10 +6734,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7207,10 +6765,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7269,8 +6823,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7291,8 +6843,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7450,9 +7000,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7481,10 +7028,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7516,10 +7059,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7578,8 +7117,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7600,8 +7137,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7759,9 +7294,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7790,10 +7322,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7822,10 +7350,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7882,8 +7406,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7902,8 +7424,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8049,9 +7569,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8081,10 +7598,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8115,10 +7628,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8177,8 +7686,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8199,8 +7706,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8356,9 +7861,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8388,10 +7890,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8422,10 +7920,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8485,8 +7979,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8507,8 +7999,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8674,9 +8164,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8707,10 +8194,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8743,10 +8226,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8808,8 +8287,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8832,8 +8309,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9013,9 +8488,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9046,10 +8518,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9082,10 +8550,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9147,8 +8611,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9171,8 +8633,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9352,9 +8812,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9384,10 +8841,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9418,10 +8871,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9480,8 +8929,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9502,8 +8949,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9663,9 +9108,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9695,10 +9137,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9729,10 +9167,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9791,8 +9225,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9813,8 +9245,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9970,9 +9400,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10003,10 +9430,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10039,10 +9462,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10104,8 +9523,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10128,8 +9545,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10309,9 +9724,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10342,10 +9754,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10378,10 +9786,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10443,8 +9847,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10467,8 +9869,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10648,9 +10048,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10681,10 +10078,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10717,10 +10110,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10782,8 +10171,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10806,8 +10193,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10987,9 +10372,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11020,10 +10402,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11056,10 +10434,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11121,8 +10495,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11145,8 +10517,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11326,9 +10696,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11359,10 +10726,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11395,10 +10758,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11460,8 +10819,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11484,8 +10841,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11661,9 +11016,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11694,10 +11046,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11730,10 +11078,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11795,8 +11139,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11819,8 +11161,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12000,9 +11340,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -12033,10 +11370,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12069,10 +11402,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12134,8 +11463,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12158,8 +11485,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12339,9 +11664,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -12372,10 +11694,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12408,10 +11726,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12473,8 +11787,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12497,8 +11809,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12678,9 +11988,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ; GFX7-LABEL: flat_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12696,10 +12003,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12716,10 +12019,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12751,8 +12050,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12767,8 +12064,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12880,9 +12175,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ; GFX7-LABEL: flat_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12898,10 +12190,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12918,10 +12206,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12953,8 +12237,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12969,8 +12251,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13082,9 +12362,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ; GFX7-LABEL: flat_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -13102,10 +12379,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13125,10 +12398,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13164,8 +12433,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13183,8 +12450,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13311,9 +12576,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -13332,10 +12594,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13357,10 +12615,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13399,8 +12653,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13419,8 +12671,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13566,9 +12816,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ; GFX7-LABEL: flat_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13580,10 +12827,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13596,10 +12839,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13623,8 +12862,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13636,8 +12873,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13721,9 +12956,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ; GFX7-LABEL: flat_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13735,10 +12967,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13751,10 +12979,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13778,8 +13002,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13791,8 +13013,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13876,9 +13096,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_store(
 ; GFX7-LABEL: flat_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13891,10 +13108,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13909,10 +13122,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13939,8 +13148,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13954,8 +13161,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14059,9 +13264,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14074,10 +13276,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14092,10 +13290,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14122,8 +13316,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14137,8 +13329,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14242,9 +13432,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14257,10 +13444,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14274,10 +13457,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14303,8 +13482,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14317,8 +13494,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14411,9 +13586,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14428,10 +13600,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14448,10 +13616,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14481,8 +13645,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14498,8 +13660,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14609,9 +13769,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14625,10 +13782,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14644,10 +13797,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14676,8 +13825,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14692,8 +13839,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14806,9 +13951,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14824,10 +13966,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14846,10 +13984,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14882,8 +14016,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14901,8 +14033,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -15032,9 +14162,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -15050,10 +14177,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -15072,10 +14195,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -15108,8 +14227,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -15127,8 +14244,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -15258,9 +14373,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15278,10 +14390,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15301,10 +14409,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15340,8 +14444,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15359,8 +14461,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15485,9 +14585,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15506,10 +14603,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15531,10 +14624,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15573,8 +14662,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15594,8 +14681,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15744,9 +14829,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15765,10 +14847,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15790,10 +14868,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15832,8 +14906,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15853,8 +14925,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -16003,9 +15073,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16031,10 +15098,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16060,10 +15123,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16114,8 +15173,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16131,8 +15188,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16251,9 +15306,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16281,10 +15333,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16313,10 +15361,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16371,8 +15415,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16391,8 +15433,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16528,9 +15568,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16557,10 +15594,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16588,10 +15621,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16645,8 +15674,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16664,8 +15691,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16804,9 +15829,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16835,10 +15857,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16869,10 +15887,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16930,8 +15944,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16952,8 +15964,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17109,9 +16119,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17140,10 +16147,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17174,10 +16177,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17235,8 +16234,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17257,8 +16254,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17414,9 +16409,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17444,10 +16436,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17476,10 +16464,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17534,8 +16518,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17554,8 +16536,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17691,9 +16671,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17721,10 +16698,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17753,10 +16726,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17811,8 +16780,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17831,8 +16798,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17968,9 +16933,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17999,10 +16961,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18033,10 +16991,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18094,8 +17048,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18116,8 +17068,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18273,9 +17223,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18304,10 +17251,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18338,10 +17281,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18399,8 +17338,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18421,8 +17358,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18578,9 +17513,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18609,10 +17541,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18643,10 +17571,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18704,8 +17628,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18726,8 +17648,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18883,9 +17803,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18914,10 +17831,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18948,10 +17861,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19009,8 +17918,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19031,8 +17938,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19188,9 +18093,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19219,10 +18121,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19253,10 +18151,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19314,8 +18208,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19336,8 +18228,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19493,9 +18383,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19524,10 +18411,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19558,10 +18441,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19619,8 +18498,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19641,8 +18518,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19798,9 +18673,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19829,10 +18701,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19863,10 +18731,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19924,8 +18788,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19946,8 +18808,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20103,9 +18963,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -20134,10 +18991,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -20168,10 +19021,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -20229,8 +19078,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20251,8 +19098,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20408,9 +19253,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20439,10 +19281,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20471,10 +19309,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20531,8 +19365,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20551,8 +19383,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20698,9 +19528,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20731,10 +19558,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20766,10 +19589,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20830,8 +19649,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20853,8 +19670,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21015,9 +19830,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21047,10 +19859,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21081,10 +19889,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21144,8 +19948,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21166,8 +19968,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21333,9 +20133,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21367,10 +20164,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21404,10 +20197,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21471,8 +20260,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21496,8 +20283,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21682,9 +20467,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21716,10 +20498,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21753,10 +20531,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21820,8 +20594,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21845,8 +20617,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22031,9 +20801,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22064,10 +20831,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22099,10 +20862,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22163,8 +20922,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22186,8 +20943,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22352,9 +21107,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22385,10 +21137,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22420,10 +21168,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22484,8 +21228,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22507,8 +21249,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22669,9 +21409,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22703,10 +21440,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22740,10 +21473,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22807,8 +21536,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22832,8 +21559,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23018,9 +21743,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23052,10 +21774,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23089,10 +21807,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23156,8 +21870,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23181,8 +21893,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23367,9 +22077,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23401,10 +22108,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23438,10 +22141,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23505,8 +22204,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23530,8 +22227,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23716,9 +22411,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23750,10 +22442,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23787,10 +22475,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23854,8 +22538,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23879,8 +22561,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24065,9 +22745,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24099,10 +22776,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24136,10 +22809,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24203,8 +22872,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24228,8 +22895,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24410,9 +23075,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24444,10 +23106,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24481,10 +23139,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24548,8 +23202,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24573,8 +23225,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24759,9 +23409,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24793,10 +23440,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24830,10 +23473,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24897,8 +23536,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24922,8 +23559,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -25108,9 +23743,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -25142,10 +23774,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -25179,10 +23807,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -25246,8 +23870,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -25271,8 +23893,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 5f788e2e41ac55..f1b465c1789da7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,9 +11,6 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,10 +27,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -51,10 +44,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -165,9 +154,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -197,10 +183,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -229,10 +211,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -433,9 +411,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -452,10 +427,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -473,10 +444,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -591,9 +558,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -623,10 +587,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -656,10 +616,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -865,9 +821,6 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -884,10 +837,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -905,10 +854,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1015,9 +960,6 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX7-LABEL: flat_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1030,10 +972,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1048,10 +986,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index dad713198cc89a..23982f8a00cdb8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_wavefront_unordered_load(
 ; GFX7-LABEL: flat_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -217,9 +202,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -419,9 +389,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_load(
 ; GFX7-LABEL: flat_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -437,10 +404,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -457,10 +420,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -492,8 +451,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -508,8 +465,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -621,9 +576,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -639,10 +591,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -659,10 +607,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -694,8 +638,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -710,8 +652,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -823,9 +763,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_unordered_store(
 ; GFX7-LABEL: flat_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -837,10 +774,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -853,10 +786,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -880,8 +809,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -893,8 +820,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -978,9 +903,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -992,10 +914,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1008,10 +926,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1035,8 +949,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1048,8 +960,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1133,9 +1043,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_store(
 ; GFX7-LABEL: flat_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1147,10 +1054,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1163,10 +1066,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1190,8 +1089,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1203,8 +1100,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1288,9 +1183,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1302,10 +1194,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1318,10 +1206,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1345,8 +1229,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1358,8 +1240,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1443,9 +1323,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1458,10 +1335,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1475,10 +1348,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1504,8 +1373,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1518,8 +1385,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1612,9 +1477,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1627,10 +1489,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1644,10 +1502,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1673,8 +1527,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1687,8 +1539,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1781,9 +1631,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1796,10 +1643,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1813,10 +1656,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1842,8 +1681,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1856,8 +1693,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1950,9 +1785,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1965,10 +1797,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1982,10 +1810,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2011,8 +1835,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2025,8 +1847,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2119,9 +1939,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2134,10 +1951,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2151,10 +1964,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2180,8 +1989,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2194,8 +2001,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2288,9 +2093,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2306,10 +2108,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2326,10 +2124,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2361,8 +2155,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2377,8 +2169,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2488,9 +2278,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2506,10 +2293,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2526,10 +2309,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2561,8 +2340,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2577,8 +2354,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2688,9 +2463,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2706,10 +2478,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2726,10 +2494,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2761,8 +2525,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2777,8 +2539,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2888,9 +2648,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2916,10 +2673,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2945,10 +2698,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2999,8 +2748,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3016,8 +2763,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3136,9 +2881,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3164,10 +2906,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3193,10 +2931,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3247,8 +2981,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3264,8 +2996,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3384,9 +3114,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3412,10 +3139,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3441,10 +3164,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3495,8 +3214,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3512,8 +3229,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3632,9 +3347,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3660,10 +3372,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3689,10 +3397,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3743,8 +3447,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3760,8 +3462,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3880,9 +3580,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3908,10 +3605,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3937,10 +3630,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3991,8 +3680,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4008,8 +3695,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4128,9 +3813,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4156,10 +3838,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4185,10 +3863,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4239,8 +3913,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4256,8 +3928,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4376,9 +4046,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4404,10 +4071,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4433,10 +4096,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4487,8 +4146,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4504,8 +4161,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4624,9 +4279,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4652,10 +4304,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4681,10 +4329,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4735,8 +4379,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4752,8 +4394,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4872,9 +4512,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4900,10 +4537,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4929,10 +4562,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4983,8 +4612,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5000,8 +4627,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5120,9 +4745,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5148,10 +4770,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5177,10 +4795,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5231,8 +4845,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5248,8 +4860,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5368,9 +4978,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5396,10 +5003,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5425,10 +5028,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5479,8 +5078,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5496,8 +5093,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5616,9 +5211,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5644,10 +5236,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5673,10 +5261,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5727,8 +5311,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5744,8 +5326,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5864,9 +5444,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5892,10 +5469,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5921,10 +5494,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5975,8 +5544,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5992,8 +5559,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6112,9 +5677,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6140,10 +5702,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6169,10 +5727,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6223,8 +5777,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6240,8 +5792,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6360,9 +5910,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6388,10 +5935,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6417,10 +5960,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6471,8 +6010,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6488,8 +6025,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6608,9 +6143,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6639,10 +6171,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6671,10 +6199,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6731,8 +6255,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6751,8 +6273,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6898,9 +6418,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6929,10 +6446,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6961,10 +6474,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7021,8 +6530,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7041,8 +6548,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7188,9 +6693,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7219,10 +6721,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7251,10 +6749,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7311,8 +6805,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7331,8 +6823,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7478,9 +6968,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7509,10 +6996,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7541,10 +7024,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7601,8 +7080,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7621,8 +7098,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7768,9 +7243,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7799,10 +7271,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7831,10 +7299,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7891,8 +7355,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7911,8 +7373,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8058,9 +7518,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8089,10 +7546,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8121,10 +7574,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8181,8 +7630,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8201,8 +7648,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8348,9 +7793,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8379,10 +7821,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8411,10 +7849,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8471,8 +7905,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8491,8 +7923,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8638,9 +8068,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8669,10 +8096,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8701,10 +8124,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8761,8 +8180,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8781,8 +8198,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8928,9 +8343,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8959,10 +8371,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8991,10 +8399,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9051,8 +8455,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9071,8 +8473,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9218,9 +8618,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9249,10 +8646,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9281,10 +8674,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9341,8 +8730,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9361,8 +8748,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9508,9 +8893,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9539,10 +8921,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9571,10 +8949,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9631,8 +9005,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9651,8 +9023,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9798,9 +9168,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9829,10 +9196,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9861,10 +9224,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9921,8 +9280,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9941,8 +9298,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10088,9 +9443,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10119,10 +9471,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10151,10 +9499,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10211,8 +9555,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10231,8 +9573,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10378,9 +9718,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10409,10 +9746,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10441,10 +9774,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10501,8 +9830,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10521,8 +9848,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10668,9 +9993,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10699,10 +10021,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10731,10 +10049,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10791,8 +10105,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10811,8 +10123,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10958,9 +10268,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10976,10 +10283,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10996,10 +10299,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11031,8 +10330,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11047,8 +10344,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11160,9 +10455,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11178,10 +10470,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11198,10 +10486,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11233,8 +10517,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11249,8 +10531,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11362,9 +10642,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11380,10 +10657,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11400,10 +10673,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11435,8 +10704,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11451,8 +10718,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11564,9 +10829,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11582,10 +10844,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11602,10 +10860,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11637,8 +10891,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11653,8 +10905,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11766,9 +11016,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11780,10 +11027,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11796,10 +11039,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11823,8 +11062,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11836,8 +11073,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11921,9 +11156,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11935,10 +11167,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11951,10 +11179,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11978,8 +11202,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11991,8 +11213,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12076,9 +11296,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ; GFX7-LABEL: flat_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12090,10 +11307,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12106,10 +11319,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12133,8 +11342,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12146,8 +11353,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12231,9 +11436,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12245,10 +11447,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12261,10 +11459,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12288,8 +11482,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12301,8 +11493,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12386,9 +11576,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12401,10 +11588,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12418,10 +11601,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12447,8 +11626,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12461,8 +11638,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12555,9 +11730,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12570,10 +11742,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12587,10 +11755,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12616,8 +11780,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12630,8 +11792,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12724,9 +11884,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12739,10 +11896,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12756,10 +11909,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12785,8 +11934,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12799,8 +11946,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12893,9 +12038,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12908,10 +12050,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12925,10 +12063,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12954,8 +12088,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12968,8 +12100,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13062,9 +12192,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13077,10 +12204,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13094,10 +12217,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13123,8 +12242,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13137,8 +12254,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13231,9 +12346,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13249,10 +12361,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13269,10 +12377,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13304,8 +12408,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13320,8 +12422,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13431,9 +12531,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13449,10 +12546,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13469,10 +12562,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13504,8 +12593,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13520,8 +12607,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13631,9 +12716,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13649,10 +12731,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13669,10 +12747,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13704,8 +12778,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13720,8 +12792,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13831,9 +12901,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13859,10 +12926,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13888,10 +12951,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13942,8 +13001,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13959,8 +13016,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14079,9 +13134,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14107,10 +13159,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14136,10 +13184,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14190,8 +13234,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14207,8 +13249,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14327,9 +13367,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14355,10 +13392,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14384,10 +13417,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14438,8 +13467,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14455,8 +13482,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14575,9 +13600,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14603,10 +13625,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14632,10 +13650,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14686,8 +13700,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14703,8 +13715,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14823,9 +13833,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14851,10 +13858,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14880,10 +13883,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14934,8 +13933,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14951,8 +13948,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15071,9 +14066,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15099,10 +14091,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15128,10 +14116,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15182,8 +14166,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15199,8 +14181,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15319,9 +14299,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15347,10 +14324,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15376,10 +14349,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15430,8 +14399,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15447,8 +14414,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15567,9 +14532,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15595,10 +14557,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15624,10 +14582,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15678,8 +14632,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15695,8 +14647,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15815,9 +14765,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15843,10 +14790,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15872,10 +14815,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15926,8 +14865,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15943,8 +14880,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16063,9 +14998,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16091,10 +15023,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16120,10 +15048,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16174,8 +15098,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16191,8 +15113,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16311,9 +15231,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16339,10 +15256,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16368,10 +15281,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16422,8 +15331,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16439,8 +15346,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16559,9 +15464,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16587,10 +15489,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16616,10 +15514,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16670,8 +15564,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16687,8 +15579,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16807,9 +15697,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16835,10 +15722,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16864,10 +15747,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16918,8 +15797,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16935,8 +15812,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17055,9 +15930,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17083,10 +15955,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17112,10 +15980,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17166,8 +16030,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17183,8 +16045,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17303,9 +16163,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17331,10 +16188,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17360,10 +16213,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17414,8 +16263,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17431,8 +16278,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17551,9 +16396,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17582,10 +16424,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17614,10 +16452,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17674,8 +16508,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17694,8 +16526,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17841,9 +16671,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17872,10 +16699,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17904,10 +16727,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17964,8 +16783,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17984,8 +16801,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18131,9 +16946,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18162,10 +16974,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18194,10 +17002,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18254,8 +17058,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18274,8 +17076,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18421,9 +17221,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18452,10 +17249,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18484,10 +17277,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18544,8 +17333,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18564,8 +17351,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18711,9 +17496,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18742,10 +17524,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18774,10 +17552,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18834,8 +17608,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18854,8 +17626,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19001,9 +17771,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19032,10 +17799,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19064,10 +17827,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19124,8 +17883,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19144,8 +17901,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19291,9 +18046,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19322,10 +18074,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19354,10 +18102,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19414,8 +18158,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19434,8 +18176,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19581,9 +18321,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19612,10 +18349,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19644,10 +18377,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19704,8 +18433,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19724,8 +18451,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19871,9 +18596,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19902,10 +18624,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19934,10 +18652,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19994,8 +18708,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20014,8 +18726,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20161,9 +18871,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20192,10 +18899,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20224,10 +18927,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20284,8 +18983,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20304,8 +19001,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20451,9 +19146,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20482,10 +19174,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20514,10 +19202,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20574,8 +19258,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20594,8 +19276,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20741,9 +19421,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20772,10 +19449,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20804,10 +19477,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20864,8 +19533,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20884,8 +19551,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21031,9 +19696,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21062,10 +19724,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21094,10 +19752,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21154,8 +19808,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21174,8 +19826,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21321,9 +19971,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21352,10 +19999,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21384,10 +20027,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21444,8 +20083,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21464,8 +20101,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index a350394bcafe50..5ddabad7374ddd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_workgroup_unordered_load(
 ; GFX7-LABEL: flat_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -217,9 +202,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -235,10 +217,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -255,10 +233,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -290,8 +264,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -306,8 +278,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -419,9 +389,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_load(
 ; GFX7-LABEL: flat_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -438,10 +405,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -459,10 +422,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -496,8 +455,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -513,8 +470,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -633,9 +588,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -653,10 +605,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -676,10 +624,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -715,8 +659,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -733,8 +675,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -866,9 +806,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_unordered_store(
 ; GFX7-LABEL: flat_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -880,10 +817,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -896,10 +829,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -923,8 +852,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -936,8 +863,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1021,9 +946,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1035,10 +957,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1051,10 +969,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1078,8 +992,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1091,8 +1003,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1176,9 +1086,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX7-LABEL: flat_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1191,10 +1098,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1209,10 +1112,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1238,8 +1137,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1252,8 +1149,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1349,9 +1244,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1364,10 +1256,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1382,10 +1270,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1411,8 +1295,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1425,8 +1307,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1522,9 +1402,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1537,10 +1414,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1554,10 +1427,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1583,8 +1452,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1597,8 +1464,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1691,9 +1556,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1707,10 +1569,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1727,10 +1585,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1758,8 +1612,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1773,8 +1625,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1879,9 +1729,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1895,10 +1742,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1914,10 +1757,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1945,8 +1784,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1960,8 +1797,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2066,9 +1901,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2083,10 +1915,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2105,10 +1933,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2138,8 +1962,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2154,8 +1976,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2272,9 +2092,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2289,10 +2106,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2311,10 +2124,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2344,8 +2153,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2360,8 +2167,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2478,9 +2283,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2497,10 +2299,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2518,10 +2316,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2555,8 +2349,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2572,8 +2364,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2690,9 +2480,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2710,10 +2497,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2733,10 +2516,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2772,8 +2551,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2790,8 +2567,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2922,9 +2697,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2942,10 +2714,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2965,10 +2733,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3004,8 +2768,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3022,8 +2784,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3154,9 +2914,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3182,10 +2939,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3211,10 +2964,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3265,8 +3014,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3282,8 +3029,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3402,9 +3147,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3431,10 +3173,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3463,10 +3201,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3519,8 +3253,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3537,8 +3269,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3669,9 +3399,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3698,10 +3425,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3729,10 +3452,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3785,8 +3504,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3803,8 +3520,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3935,9 +3650,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3965,10 +3677,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3999,10 +3707,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4057,8 +3761,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4076,8 +3778,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4220,9 +3920,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4250,10 +3947,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4284,10 +3977,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4342,8 +4031,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4361,8 +4048,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4505,9 +4190,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4534,10 +4216,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4566,10 +4244,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4622,8 +4296,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4640,8 +4312,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4772,9 +4442,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4801,10 +4468,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4833,10 +4496,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4889,8 +4548,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4907,8 +4564,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5039,9 +4694,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5069,10 +4721,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5103,10 +4751,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5161,8 +4805,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5180,8 +4822,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5324,9 +4964,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5354,10 +4991,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5388,10 +5021,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5446,8 +5075,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5465,8 +5092,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5609,9 +5234,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5639,10 +5261,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5673,10 +5291,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5731,8 +5345,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5750,8 +5362,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5894,9 +5504,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5924,10 +5531,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5958,10 +5561,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6016,8 +5615,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6035,8 +5632,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6179,9 +5774,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6210,10 +5802,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6242,10 +5830,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6302,8 +5886,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6322,8 +5904,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6469,9 +6049,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6501,10 +6078,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6534,10 +6107,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6596,8 +6165,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6617,8 +6184,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6771,9 +6336,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6803,10 +6365,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6837,10 +6395,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6899,8 +6453,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6920,8 +6472,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7079,9 +6629,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7112,10 +6659,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7147,10 +6690,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7211,8 +6750,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7233,8 +6770,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7401,9 +6936,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7434,10 +6966,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7469,10 +6997,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7533,8 +7057,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7555,8 +7077,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7723,9 +7243,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7755,10 +7272,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7788,10 +7301,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7850,8 +7359,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7871,8 +7378,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8027,9 +7532,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8059,10 +7561,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8092,10 +7590,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8154,8 +7648,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8175,8 +7667,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8329,9 +7819,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8362,10 +7849,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8397,10 +7880,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8461,8 +7940,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8483,8 +7960,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8651,9 +8126,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8684,10 +8156,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8719,10 +8187,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8783,8 +8247,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8805,8 +8267,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8973,9 +8433,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9006,10 +8463,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9041,10 +8494,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9105,8 +8554,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9127,8 +8574,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9295,9 +8740,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9328,10 +8770,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9363,10 +8801,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9427,8 +8861,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9449,8 +8881,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9617,9 +9047,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9650,10 +9077,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9685,10 +9108,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9749,8 +9168,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9771,8 +9188,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9937,9 +9352,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9970,10 +9382,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10005,10 +9413,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10069,8 +9473,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10091,8 +9493,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10259,9 +9659,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10292,10 +9689,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10327,10 +9720,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10391,8 +9780,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10413,8 +9800,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10581,9 +9966,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10614,10 +9996,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10649,10 +10027,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10713,8 +10087,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10735,8 +10107,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10903,9 +10273,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10921,10 +10288,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10941,10 +10304,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10976,8 +10335,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10992,8 +10349,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11105,9 +10460,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11123,10 +10475,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11143,10 +10491,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11178,8 +10522,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11194,8 +10536,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11307,9 +10647,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11325,10 +10662,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11347,10 +10680,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11382,8 +10711,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11398,8 +10725,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11517,9 +10842,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11535,10 +10857,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11559,10 +10877,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11594,8 +10908,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11610,8 +10922,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11739,9 +11049,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11753,10 +11060,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11769,10 +11072,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11796,8 +11095,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11809,8 +11106,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11894,9 +11189,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11908,10 +11200,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11924,10 +11212,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11951,8 +11235,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11964,8 +11246,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12049,9 +11329,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX7-LABEL: flat_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12063,10 +11340,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12081,10 +11354,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12108,8 +11377,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12121,8 +11388,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12215,9 +11480,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12229,10 +11491,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12247,10 +11505,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12274,8 +11528,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12287,8 +11539,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12381,9 +11631,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12396,10 +11643,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12413,10 +11656,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12442,8 +11681,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12456,8 +11693,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12550,9 +11785,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12565,10 +11797,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12584,10 +11812,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12613,8 +11837,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12627,8 +11849,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12729,9 +11949,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12744,10 +11961,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12763,10 +11976,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12792,8 +12001,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12806,8 +12013,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12909,9 +12114,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12924,10 +12126,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12945,10 +12143,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12974,8 +12168,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12988,8 +12180,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13099,9 +12289,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13114,10 +12301,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13135,10 +12318,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13164,8 +12343,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13178,8 +12355,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13289,9 +12464,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13307,10 +12479,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13329,10 +12497,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13364,8 +12528,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13380,8 +12542,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13497,9 +12657,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13515,10 +12672,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13539,10 +12692,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13574,8 +12723,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13590,8 +12737,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13718,9 +12863,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13736,10 +12878,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13760,10 +12898,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13795,8 +12929,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13811,8 +12943,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13939,9 +13069,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13967,10 +13094,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13996,10 +13119,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14050,8 +13169,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14067,8 +13184,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14187,9 +13302,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14215,10 +13327,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14246,10 +13354,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14300,8 +13404,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14317,8 +13419,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14445,9 +13545,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14473,10 +13570,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14504,10 +13597,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14558,8 +13647,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14575,8 +13662,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14704,9 +13789,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14732,10 +13814,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14765,10 +13843,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14819,8 +13893,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14836,8 +13908,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14973,9 +14043,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15001,10 +14068,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15034,10 +14097,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15088,8 +14147,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15105,8 +14162,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15242,9 +14297,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15270,10 +14322,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15301,10 +14349,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15355,8 +14399,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15372,8 +14414,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15500,9 +14540,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15528,10 +14565,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15559,10 +14592,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15613,8 +14642,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15630,8 +14657,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15758,9 +14783,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15786,10 +14808,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15819,10 +14837,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15873,8 +14887,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15890,8 +14902,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16027,9 +15037,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16055,10 +15062,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16088,10 +15091,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16142,8 +15141,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16159,8 +15156,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16296,9 +15291,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16324,10 +15316,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16357,10 +15345,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16411,8 +15395,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16428,8 +15410,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16565,9 +15545,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16593,10 +15570,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16626,10 +15599,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16680,8 +15649,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16697,8 +15664,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16834,9 +15799,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16862,10 +15824,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16895,10 +15853,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16949,8 +15903,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16966,8 +15918,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17103,9 +16053,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17131,10 +16078,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17164,10 +16107,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17218,8 +16157,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17235,8 +16172,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17372,9 +16307,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17400,10 +16332,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17433,10 +16361,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17487,8 +16411,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17504,8 +16426,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17641,9 +16561,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17669,10 +16586,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17702,10 +16615,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17756,8 +16665,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17773,8 +16680,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17910,9 +16815,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17941,10 +16843,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17973,10 +16871,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18033,8 +16927,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18053,8 +16945,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18200,9 +17090,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18231,10 +17118,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18265,10 +17148,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18325,8 +17204,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18345,8 +17222,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18498,9 +17373,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18529,10 +17401,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18563,10 +17431,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18623,8 +17487,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18643,8 +17505,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18799,9 +17659,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18830,10 +17687,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18866,10 +17719,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18926,8 +17775,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18946,8 +17793,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19110,9 +17955,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19141,10 +17983,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19177,10 +18015,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19237,8 +18071,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19257,8 +18089,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19421,9 +18251,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19452,10 +18279,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19486,10 +18309,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19546,8 +18365,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19566,8 +18383,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19721,9 +18536,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19752,10 +18564,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19786,10 +18594,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19846,8 +18650,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19866,8 +18668,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20019,9 +18819,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20050,10 +18847,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20086,10 +18879,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20146,8 +18935,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20166,8 +18953,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20330,9 +19115,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20361,10 +19143,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20397,10 +19175,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20457,8 +19231,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20477,8 +19249,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20641,9 +19411,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20672,10 +19439,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20708,10 +19471,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20768,8 +19527,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20788,8 +19545,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20952,9 +19707,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20983,10 +19735,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21019,10 +19767,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21079,8 +19823,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21099,8 +19841,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21263,9 +20003,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21294,10 +20031,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21330,10 +20063,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21390,8 +20119,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21410,8 +20137,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21572,9 +20297,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21603,10 +20325,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21639,10 +20357,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21699,8 +20413,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21719,8 +20431,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21883,9 +20593,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21914,10 +20621,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21950,10 +20653,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22010,8 +20709,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22030,8 +20727,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22194,9 +20889,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22225,10 +20917,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22261,10 +20949,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22321,8 +21005,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22341,8 +21023,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 8d14f92d9806e3..8a02ad5dfdb7b0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_agent_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -241,9 +238,6 @@ define amdgpu_kernel void @global_agent_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -443,9 +437,6 @@ define amdgpu_kernel void @global_agent_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -661,9 +652,6 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -888,9 +876,6 @@ define amdgpu_kernel void @global_agent_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1060,9 +1045,6 @@ define amdgpu_kernel void @global_agent_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1233,9 +1215,6 @@ define amdgpu_kernel void @global_agent_release_store(
 ;
 ; GFX7-LABEL: global_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1432,9 +1411,6 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1630,9 +1606,6 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1793,9 +1766,6 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1982,9 +1952,6 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2172,9 +2139,6 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2389,9 +2353,6 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2606,9 +2567,6 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2812,9 +2770,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3048,9 +3003,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3284,9 +3236,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3515,9 +3464,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3772,9 +3718,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4030,9 +3973,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4315,9 +4255,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4599,9 +4536,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4857,9 +4791,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5116,9 +5047,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5401,9 +5329,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5686,9 +5611,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5971,9 +5893,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6256,9 +6175,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6541,9 +6457,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6826,9 +6739,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7111,9 +7021,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7396,9 +7303,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7657,9 +7561,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7934,9 +7835,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8222,9 +8120,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8530,9 +8425,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8837,9 +8729,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9118,9 +9007,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9396,9 +9282,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9704,9 +9587,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10012,9 +9892,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10320,9 +10197,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10628,9 +10502,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10932,9 +10803,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11240,9 +11108,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11548,9 +11413,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11854,9 +11716,6 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12055,9 +11914,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12257,9 +12113,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12475,9 +12328,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12702,9 +12552,6 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12874,9 +12721,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13047,9 +12891,6 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13246,9 +13087,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13444,9 +13282,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13607,9 +13442,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13796,9 +13628,6 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13986,9 +13815,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14203,9 +14029,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14420,9 +14243,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14626,9 +14446,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14862,9 +14679,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15098,9 +14912,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15329,9 +15140,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15586,9 +15394,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15844,9 +15649,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16129,9 +15931,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16413,9 +16212,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16671,9 +16467,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16930,9 +16723,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17215,9 +17005,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17500,9 +17287,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17785,9 +17569,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18070,9 +17851,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18355,9 +18133,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18640,9 +18415,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18925,9 +18697,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19210,9 +18979,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19471,9 +19237,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19749,9 +19512,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20057,9 +19817,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20364,9 +20121,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20645,9 +20399,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20923,9 +20674,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21231,9 +20979,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21539,9 +21284,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21847,9 +21589,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22155,9 +21894,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22459,9 +22195,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22767,9 +22500,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23075,9 +22805,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 1069cb6f0135d9..14f1734235673a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -35,9 +35,6 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ;
 ; GFX7-LABEL: global_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -245,9 +242,6 @@ define amdgpu_kernel void @global_nontemporal_load_1(
 ;
 ; GFX7-LABEL: global_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -512,9 +506,6 @@ define amdgpu_kernel void @global_nontemporal_store_0(
 ;
 ; GFX7-LABEL: global_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -717,9 +708,6 @@ define amdgpu_kernel void @global_nontemporal_store_1(
 ;
 ; GFX7-LABEL: global_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -973,9 +961,6 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: global_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index bf4d77ad61c6b2..33aaeebf658dd6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -241,9 +238,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -442,9 +436,6 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -643,9 +634,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -837,9 +825,6 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1009,9 +994,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1181,9 +1163,6 @@ define amdgpu_kernel void @global_singlethread_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1353,9 +1332,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1525,9 +1501,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1686,9 +1659,6 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1847,9 +1817,6 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2008,9 +1975,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2169,9 +2133,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2332,9 +2293,6 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2521,9 +2479,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2710,9 +2665,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2901,9 +2853,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3130,9 +3079,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3359,9 +3305,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3588,9 +3531,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3817,9 +3757,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4046,9 +3983,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4275,9 +4209,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4504,9 +4435,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4733,9 +4661,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4962,9 +4887,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5191,9 +5113,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5420,9 +5339,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5649,9 +5565,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5878,9 +5791,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6107,9 +6017,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6339,9 +6246,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6599,9 +6503,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6859,9 +6760,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7119,9 +7017,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7379,9 +7274,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7639,9 +7531,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7899,9 +7788,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8159,9 +8045,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8419,9 +8302,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8679,9 +8559,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8939,9 +8816,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9199,9 +9073,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9459,9 +9330,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9719,9 +9587,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9979,9 +9844,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10239,9 +10101,6 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10440,9 +10299,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10641,9 +10497,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10842,9 +10695,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11036,9 +10886,6 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11208,9 +11055,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11380,9 +11224,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11552,9 +11393,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11724,9 +11562,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11885,9 +11720,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12046,9 +11878,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12207,9 +12036,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12368,9 +12194,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12531,9 +12354,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12720,9 +12540,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12909,9 +12726,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13100,9 +12914,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13329,9 +13140,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13558,9 +13366,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13787,9 +13592,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14016,9 +13818,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14245,9 +14044,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14474,9 +14270,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14703,9 +14496,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14932,9 +14722,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15161,9 +14948,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15390,9 +15174,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15619,9 +15400,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15848,9 +15626,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16077,9 +15852,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16306,9 +16078,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16538,9 +16307,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16798,9 +16564,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17058,9 +16821,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17318,9 +17078,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17578,9 +17335,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17838,9 +17592,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18098,9 +17849,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18358,9 +18106,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18618,9 +18363,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18878,9 +18620,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19138,9 +18877,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19398,9 +19134,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19658,9 +19391,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19918,9 +19648,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20178,9 +19905,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index ecef93dcd84d90..ae5ec082024fd6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_system_unordered_load(
 ;
 ; GFX7-LABEL: global_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -241,9 +238,6 @@ define amdgpu_kernel void @global_system_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -443,9 +437,6 @@ define amdgpu_kernel void @global_system_acquire_load(
 ;
 ; GFX7-LABEL: global_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -663,9 +654,6 @@ define amdgpu_kernel void @global_system_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -892,9 +880,6 @@ define amdgpu_kernel void @global_system_unordered_store(
 ;
 ; GFX7-LABEL: global_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1064,9 +1049,6 @@ define amdgpu_kernel void @global_system_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1237,9 +1219,6 @@ define amdgpu_kernel void @global_system_release_store(
 ;
 ; GFX7-LABEL: global_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1438,9 +1417,6 @@ define amdgpu_kernel void @global_system_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1638,9 +1614,6 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1801,9 +1774,6 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1992,9 +1962,6 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2184,9 +2151,6 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2405,9 +2369,6 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2626,9 +2587,6 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2834,9 +2792,6 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3074,9 +3029,6 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3314,9 +3266,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3545,9 +3494,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3804,9 +3750,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4064,9 +4007,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4353,9 +4293,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4641,9 +4578,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4901,9 +4835,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5162,9 +5093,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5451,9 +5379,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5740,9 +5665,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6029,9 +5951,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6318,9 +6237,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6579,9 +6495,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6859,9 +6772,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7171,9 +7081,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7482,9 +7389,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7765,9 +7669,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8045,9 +7946,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8357,9 +8255,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8669,9 +8564,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8981,9 +8873,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9293,9 +9182,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9601,9 +9487,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9913,9 +9796,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10225,9 +10105,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10535,9 +10412,6 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10736,9 +10610,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10938,9 +10809,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11158,9 +11026,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11387,9 +11252,6 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11559,9 +11421,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11732,9 +11591,6 @@ define amdgpu_kernel void @global_system_one_as_release_store(
 ;
 ; GFX7-LABEL: global_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11933,9 +11789,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12133,9 +11986,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12296,9 +12146,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12487,9 +12334,6 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12679,9 +12523,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12900,9 +12741,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13121,9 +12959,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13329,9 +13164,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13569,9 +13401,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13809,9 +13638,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14040,9 +13866,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14299,9 +14122,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14559,9 +14379,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14848,9 +14665,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15136,9 +14950,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15396,9 +15207,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15657,9 +15465,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15946,9 +15751,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16235,9 +16037,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16524,9 +16323,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16813,9 +16609,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17102,9 +16895,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17391,9 +17181,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17680,9 +17467,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17969,9 +17753,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18230,9 +18011,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18509,9 +18287,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18799,9 +18574,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19111,9 +18883,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19422,9 +19191,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19705,9 +19471,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19985,9 +19748,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20297,9 +20057,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20609,9 +20366,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20921,9 +20675,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21233,9 +20984,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21541,9 +21289,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21853,9 +21598,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22165,9 +21907,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index eea71794e549e8..d916ff533e77bd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -36,9 +36,6 @@ define amdgpu_kernel void @global_volatile_load_0(
 ;
 ; GFX7-LABEL: global_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -195,9 +192,6 @@ define amdgpu_kernel void @global_volatile_load_1(
 ;
 ; GFX7-LABEL: global_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -395,9 +389,6 @@ define amdgpu_kernel void @global_volatile_store_0(
 ;
 ; GFX7-LABEL: global_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -563,9 +554,6 @@ define amdgpu_kernel void @global_volatile_store_1(
 ;
 ; GFX7-LABEL: global_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -769,9 +757,6 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -915,9 +900,6 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 95bc4ddd0cff74..aaa11c0455606f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -241,9 +238,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -442,9 +436,6 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -643,9 +634,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -837,9 +825,6 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1009,9 +994,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1181,9 +1163,6 @@ define amdgpu_kernel void @global_wavefront_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1353,9 +1332,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1525,9 +1501,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1686,9 +1659,6 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1847,9 +1817,6 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2008,9 +1975,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2169,9 +2133,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2332,9 +2293,6 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2521,9 +2479,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2710,9 +2665,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2901,9 +2853,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3130,9 +3079,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3359,9 +3305,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3588,9 +3531,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3817,9 +3757,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4046,9 +3983,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4275,9 +4209,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4504,9 +4435,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4733,9 +4661,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4962,9 +4887,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5191,9 +5113,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5420,9 +5339,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5649,9 +5565,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5878,9 +5791,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6107,9 +6017,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6339,9 +6246,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6599,9 +6503,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6859,9 +6760,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7119,9 +7017,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7379,9 +7274,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7639,9 +7531,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7899,9 +7788,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8159,9 +8045,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8419,9 +8302,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8679,9 +8559,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8939,9 +8816,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9199,9 +9073,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9459,9 +9330,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9719,9 +9587,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9979,9 +9844,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10239,9 +10101,6 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10440,9 +10299,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10641,9 +10497,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10842,9 +10695,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11036,9 +10886,6 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11208,9 +11055,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11380,9 +11224,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11552,9 +11393,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11724,9 +11562,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11885,9 +11720,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12046,9 +11878,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12207,9 +12036,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12368,9 +12194,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12531,9 +12354,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12720,9 +12540,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12909,9 +12726,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13100,9 +12914,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13329,9 +13140,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13558,9 +13366,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13787,9 +13592,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14016,9 +13818,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14245,9 +14044,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14474,9 +14270,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14703,9 +14496,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14932,9 +14722,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15161,9 +14948,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15390,9 +15174,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15619,9 +15400,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15848,9 +15626,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16077,9 +15852,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16306,9 +16078,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16538,9 +16307,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16798,9 +16564,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17058,9 +16821,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17318,9 +17078,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17578,9 +17335,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17838,9 +17592,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18098,9 +17849,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18358,9 +18106,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18618,9 +18363,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18878,9 +18620,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19138,9 +18877,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19398,9 +19134,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19658,9 +19391,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19918,9 +19648,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20178,9 +19905,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 28c6f255e86a8e..26511f079fa8f8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -241,9 +238,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -442,9 +436,6 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -649,9 +640,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -859,9 +847,6 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1031,9 +1016,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1204,9 +1186,6 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1395,9 +1374,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1585,9 +1561,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1746,9 +1719,6 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1918,9 +1888,6 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2098,9 +2065,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2288,9 +2252,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2479,9 +2440,6 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2674,9 +2632,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2889,9 +2844,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3105,9 +3057,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3334,9 +3283,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3574,9 +3520,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3822,9 +3765,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4080,9 +4020,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4337,9 +4274,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4576,9 +4510,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4816,9 +4747,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5074,9 +5002,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5332,9 +5257,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5590,9 +5512,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5848,9 +5767,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6106,9 +6022,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6364,9 +6277,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6622,9 +6532,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6882,9 +6789,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7142,9 +7046,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7408,9 +7309,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7687,9 +7585,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7973,9 +7868,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8258,9 +8150,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8525,9 +8414,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8791,9 +8677,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9077,9 +8960,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9363,9 +9243,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9649,9 +9526,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9935,9 +9809,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10219,9 +10090,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10505,9 +10373,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10791,9 +10656,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11076,9 +10938,6 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11277,9 +11136,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11478,9 +11334,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11684,9 +11537,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11891,9 +11741,6 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12063,9 +11910,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12235,9 +12079,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12418,9 +12259,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12601,9 +12439,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12762,9 +12597,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12933,9 +12765,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13105,9 +12934,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13287,9 +13113,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13471,9 +13294,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13665,9 +13485,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13872,9 +13689,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14081,9 +13895,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14310,9 +14121,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14549,9 +14357,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14789,9 +14594,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15039,9 +14841,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15289,9 +15088,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15528,9 +15324,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15767,9 +15560,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16017,9 +15807,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16267,9 +16054,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16517,9 +16301,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16767,9 +16548,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17017,9 +16795,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17267,9 +17042,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17517,9 +17289,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17770,9 +17539,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18030,9 +17796,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18295,9 +18058,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18566,9 +18326,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18844,9 +18601,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19122,9 +18876,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19389,9 +19140,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19654,9 +19402,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19932,9 +19677,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20210,9 +19952,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20488,9 +20227,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20766,9 +20502,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21042,9 +20775,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21320,9 +21050,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21598,9 +21325,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 24598bcbea5bba..fce60ff12aed3d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -37,9 +37,6 @@ define amdgpu_kernel void @local_nontemporal_load_0(
 ;
 ; GFX7-LABEL: local_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -243,9 +240,6 @@ define amdgpu_kernel void @local_nontemporal_load_1(
 ;
 ; GFX7-LABEL: local_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -901,9 +895,6 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: local_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 455834572a59ac..a8f7051bd5050c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -33,9 +33,6 @@ define amdgpu_kernel void @local_volatile_load_0(
 ;
 ; GFX7-LABEL: local_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -183,9 +180,6 @@ define amdgpu_kernel void @local_volatile_load_1(
 ;
 ; GFX7-LABEL: local_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 55c40ee491100e..c3599c87985bec 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -37,10 +37,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX7-LABEL: private_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -56,7 +53,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -72,7 +69,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -113,7 +110,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -129,7 +126,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -251,10 +248,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX7-LABEL: private_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -272,7 +266,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -290,7 +284,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -335,7 +329,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -356,7 +350,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -513,7 +507,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX7-LABEL: private_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -528,7 +522,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -544,7 +538,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -579,7 +573,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -595,7 +589,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -711,7 +705,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX7-LABEL: private_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -728,7 +722,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -744,7 +738,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -781,7 +775,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -800,7 +794,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -951,10 +945,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: private_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -970,7 +961,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -986,7 +977,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1027,7 +1018,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1043,7 +1034,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index cdb1b463ac321e..9146f175eefcd1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -37,10 +37,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX7-LABEL: private_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -56,7 +53,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -72,7 +69,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -201,10 +198,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX7-LABEL: private_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -222,7 +216,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -240,7 +234,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -392,7 +386,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX7-LABEL: private_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -408,7 +402,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -425,7 +419,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -555,7 +549,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX7-LABEL: private_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -573,7 +567,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -590,7 +584,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 86cdf3ccd04415..02ce58dd754032 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -34,13 +34,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -59,13 +56,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -152,9 +146,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_imin_sle_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -166,9 +157,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_imin_sle_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -230,9 +218,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; CI-LABEL: s_test_imin_sle_v1i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -244,9 +229,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; VI-LABEL: s_test_imin_sle_v1i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -310,9 +292,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; CI-LABEL: s_test_imin_sle_v4i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -331,9 +310,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; VI-LABEL: s_test_imin_sle_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -446,14 +422,11 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i8 s2, s2
 ; CI-NEXT:    s_sext_i32_i8 s3, s3
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_byte v[0:1], v2
@@ -464,14 +437,11 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i8 s2, s2
 ; VI-NEXT:    s_sext_i32_i8 s3, s3
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -589,8 +559,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s4, s2, 24
 ; CI-NEXT:    s_sext_i32_i8 s5, s2
@@ -614,7 +582,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; CI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; CI-NEXT:    s_or_b32 s2, s3, s2
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -624,8 +591,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s4, s2, 24
 ; VI-NEXT:    s_bfe_i32 s5, s2, 0x80010
@@ -649,7 +614,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_or_b32 s2, s2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -804,9 +768,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; CI-LABEL: s_test_imin_sle_v2i16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s4, s2, 16
 ; CI-NEXT:    s_sext_i32_i16 s2, s2
@@ -826,9 +787,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; VI-LABEL: s_test_imin_sle_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s4, s2, 16
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
@@ -963,9 +921,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s6, s0, 16
 ; CI-NEXT:    s_ashr_i32 s7, s1, 16
@@ -996,9 +951,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s6, s1, 16
 ; VI-NEXT:    s_sext_i32_i16 s1, s1
@@ -1097,13 +1049,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1122,13 +1071,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1244,13 +1190,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1269,13 +1212,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1363,9 +1303,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_imin_slt_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1377,9 +1314,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_imin_slt_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1443,9 +1377,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s1, s1, s3
 ; CI-NEXT:    s_min_i32 s0, s0, s2
@@ -1460,9 +1391,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s1, s1, s3
 ; VI-NEXT:    s_min_i32 s0, s0, s2
@@ -1537,9 +1465,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1552,9 +1477,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1622,9 +1544,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1637,9 +1556,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1719,13 +1635,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1744,13 +1657,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1856,15 +1766,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v6
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, s5
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, s4, v6
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; CI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; CI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
@@ -1883,15 +1790,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v6
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; VI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
@@ -2025,15 +1929,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
@@ -2064,15 +1965,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
@@ -2171,9 +2069,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_umin_ule_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2185,9 +2080,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_umin_ule_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2262,13 +2154,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2287,13 +2176,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2399,9 +2285,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s3
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
@@ -2423,9 +2306,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v0
@@ -2513,9 +2393,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_umin_ult_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2527,9 +2404,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_umin_ult_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2613,9 +2487,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; CI-LABEL: v_test_umin_ult_i32_multi_use:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s5, s[6:7], 0x0
@@ -2637,9 +2508,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; VI-LABEL: v_test_umin_ult_i32_multi_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s5, s[6:7], 0x0
@@ -2769,9 +2637,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; CI-LABEL: v_test_umin_ult_i16_multi_use:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
@@ -2794,9 +2659,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; VI-LABEL: v_test_umin_ult_i16_multi_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
@@ -2889,9 +2751,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; CI-LABEL: s_test_umin_ult_v1i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2903,9 +2762,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; VI-LABEL: s_test_umin_ult_v1i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2977,9 +2833,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; CI-LABEL: s_test_umin_ult_v8i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x8
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3011,9 +2864,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; VI-LABEL: s_test_umin_ult_v8i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3274,9 +3124,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; CI-LABEL: s_test_umin_ult_v8i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3323,9 +3170,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; VI-LABEL: s_test_umin_ult_v8i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3451,14 +3295,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; CI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -3469,14 +3310,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -3566,14 +3404,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i16 s2, s2
 ; CI-NEXT:    s_sext_i32_i16 s3, s3
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -3584,14 +3419,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -3688,9 +3520,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i16 s3, s2
 ; CI-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3705,9 +3534,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i16 s3, s2
 ; VI-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3790,9 +3616,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3811,9 +3634,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3906,9 +3726,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3927,9 +3744,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4022,9 +3836,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4043,9 +3854,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4138,9 +3946,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4159,9 +3964,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4278,12 +4080,9 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
@@ -4312,13 +4111,10 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4436,12 +4232,9 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
@@ -4469,13 +4262,10 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index d7814c52828b88..5792fab7011afe 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -180,9 +180,6 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -263,9 +260,6 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -347,9 +341,6 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -412,9 +403,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -477,9 +465,6 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -542,9 +527,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -606,9 +588,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 3de6945f95556b..529e64715500dd 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -176,9 +176,6 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1)
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -257,9 +254,6 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -339,9 +333,6 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -402,9 +393,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -466,9 +454,6 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -529,9 +514,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 993eb11ef5e953..5a8d0f3d0f158c 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -10,36 +10,36 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX908: bb.0 (%ir-block.0):
   ; REGALLOC-GFX908-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX908-NEXT: {{  $}}
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %27
-  ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:av_128 = COPY %27
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %24
-  ; REGALLOC-GFX908-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26
+  ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:av_128 = COPY %26
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23
+  ; REGALLOC-GFX908-NEXT:   SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; REGALLOC-GFX908-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   S_ENDPGM 0
   ;
   ; PEI-GFX908-LABEL: name: partial_copy
   ; PEI-GFX908: bb.0 (%ir-block.0):
-  ; PEI-GFX908-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+  ; PEI-GFX908-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
   ; PEI-GFX908-NEXT: {{  $}}
-  ; PEI-GFX908-NEXT:   $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
-  ; PEI-GFX908-NEXT:   $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-  ; PEI-GFX908-NEXT:   $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+  ; PEI-GFX908-NEXT:   $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; PEI-GFX908-NEXT:   $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+  ; PEI-GFX908-NEXT:   $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
   ; PEI-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
   ; PEI-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
   ; PEI-GFX908-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
   ; PEI-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
-  ; PEI-GFX908-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+  ; PEI-GFX908-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
   ; PEI-GFX908-NEXT:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
   ; PEI-GFX908-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
   ; PEI-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
@@ -48,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX908-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX908-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX908-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-  ; PEI-GFX908-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+  ; PEI-GFX908-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
   ; PEI-GFX908-NEXT:   $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
   ; PEI-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX908-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -59,34 +59,34 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX90A: bb.0 (%ir-block.0):
   ; REGALLOC-GFX90A-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX90A-NEXT: {{  $}}
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %26
-  ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:av_128_align2 = COPY %26
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %24
-  ; REGALLOC-GFX90A-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25
+  ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:av_128_align2 = COPY %25
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23
+  ; REGALLOC-GFX90A-NEXT:   SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   S_ENDPGM 0
   ;
   ; PEI-GFX90A-LABEL: name: partial_copy
   ; PEI-GFX90A: bb.0 (%ir-block.0):
-  ; PEI-GFX90A-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+  ; PEI-GFX90A-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
   ; PEI-GFX90A-NEXT: {{  $}}
-  ; PEI-GFX90A-NEXT:   $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
-  ; PEI-GFX90A-NEXT:   $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-  ; PEI-GFX90A-NEXT:   $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+  ; PEI-GFX90A-NEXT:   $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; PEI-GFX90A-NEXT:   $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+  ; PEI-GFX90A-NEXT:   $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
   ; PEI-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
   ; PEI-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
   ; PEI-GFX90A-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
   ; PEI-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
-  ; PEI-GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+  ; PEI-GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
   ; PEI-GFX90A-NEXT:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -94,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX90A-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-  ; PEI-GFX90A-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+  ; PEI-GFX90A-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
   ; PEI-GFX90A-NEXT:   $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 847c2d343d4153..c2132cf907fdb2 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -6,9 +6,6 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -36,12 +33,9 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x5a
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_sad_u32 v2, s2, v0, 20
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -63,9 +57,6 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -92,9 +83,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -103,7 +92,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -131,15 +119,12 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_sad_u32 v2, s0, v2, v3
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
@@ -166,9 +151,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s3, s0, s1
@@ -179,7 +162,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -204,9 +186,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -217,7 +197,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -243,9 +222,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s1
@@ -256,7 +233,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -279,9 +255,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -290,7 +264,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -312,9 +285,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; GCN-LABEL: v_sad_u32_vector_pat1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -351,9 +321,6 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; GCN-LABEL: v_sad_u32_vector_pat2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -391,8 +358,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 ; GCN-NEXT:    s_load_dword s4, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 16
@@ -400,7 +365,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_sad_u32 v2, s4, v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -420,9 +384,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sad_u32_i16_pat2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_ushort v0, v[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -455,9 +416,6 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xff
 ; GCN-NEXT:    s_bfe_u32 s4, s2, 0x80008
@@ -485,9 +443,6 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
 define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sad_u32_i8_pat2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -520,9 +475,6 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xff
 ; GCN-NEXT:    s_bfe_u32 s4, s2, 0x80008
@@ -550,9 +502,6 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s6, s0, s1
 ; GCN-NEXT:    s_cmp_le_u32 s0, s1
@@ -582,9 +531,6 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s3
 ; GCN-NEXT:    s_sub_i32 s6, s1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 9826585df8bd80..89a09dc4fcc171 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -9,8 +9,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -26,8 +24,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX906-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX906-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -43,8 +39,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX908-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX908-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -61,8 +55,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
-; GFX90A-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -96,8 +88,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -113,8 +103,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX906-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX906-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -130,8 +118,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX908-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX908-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -148,8 +134,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
-; GFX90A-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 5d0ddcc7114c26..5dcb4b7c979ab1 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -24,179 +24,175 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    v_writelane_b32 v22, s2, 0
 ; CHECK-NEXT:    v_writelane_b32 v22, s3, 1
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[48:51]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:11]
+; CHECK-NEXT:    ; def s[4:7]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v23, s4, 2
 ; CHECK-NEXT:    v_writelane_b32 v23, s5, 3
 ; CHECK-NEXT:    v_writelane_b32 v23, s6, 4
 ; CHECK-NEXT:    v_writelane_b32 v23, s7, 5
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 6
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 7
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 8
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 9
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[4:11]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 6
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 7
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 8
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 9
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 10
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 11
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 12
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 13
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:19]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 10
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 11
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 12
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 13
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 14
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 15
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 16
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 17
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 18
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 19
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 20
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 21
-; CHECK-NEXT:    v_writelane_b32 v23, s16, 22
-; CHECK-NEXT:    v_writelane_b32 v23, s17, 23
-; CHECK-NEXT:    v_writelane_b32 v23, s18, 24
-; CHECK-NEXT:    v_writelane_b32 v23, s19, 25
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 14
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 15
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 16
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 17
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 18
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 19
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 20
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 21
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 22
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 23
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 24
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 25
+; CHECK-NEXT:    v_writelane_b32 v23, s16, 26
+; CHECK-NEXT:    v_writelane_b32 v23, s17, 27
+; CHECK-NEXT:    v_writelane_b32 v23, s18, 28
+; CHECK-NEXT:    v_writelane_b32 v23, s19, 29
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[38:39]
+; CHECK-NEXT:    ; def s[42:43]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[44:47]
+; CHECK-NEXT:    ; def s[52:55]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:11]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 26
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 27
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 28
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 29
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 30
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 31
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 32
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 33
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 30
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 31
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 32
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 33
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 34
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 35
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 36
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 37
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[36:37]
+; CHECK-NEXT:    ; def s[40:41]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[40:43]
+; CHECK-NEXT:    ; def s[36:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ; def s[44:51]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 34
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 35
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 36
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 37
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 38
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 39
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 40
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 42
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 43
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 44
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 45
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 46
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 47
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 48
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 49
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 50
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 51
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 52
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 53
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 54
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 55
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 56
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 57
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 38
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 39
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 40
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 41
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 42
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 43
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 44
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 45
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 46
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 47
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 48
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 49
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 50
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 51
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 52
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 53
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 54
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 55
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 56
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 57
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v23, s0, 58
 ; CHECK-NEXT:    v_writelane_b32 v23, s1, 59
 ; CHECK-NEXT:    v_writelane_b32 v23, s2, 60
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    v_writelane_b32 v23, s3, 61
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 62
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 4
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 63
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 5
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 62
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 0
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 63
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 1
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 19
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 4
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s8, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s9, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s10, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s11, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s12, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s13, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s14, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s15, 17
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 19
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 23
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v0, s0, 24
 ; CHECK-NEXT:    v_writelane_b32 v0, s1, 25
 ; CHECK-NEXT:    v_writelane_b32 v0, s2, 26
 ; CHECK-NEXT:    v_writelane_b32 v0, s3, 27
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 31
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 31
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 36
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 37
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 38
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 39
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 40
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 41
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 42
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 43
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 44
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 45
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 46
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 47
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 48
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 49
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 50
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 51
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 36
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 37
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 38
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 39
+; CHECK-NEXT:    v_writelane_b32 v0, s8, 40
+; CHECK-NEXT:    v_writelane_b32 v0, s9, 41
+; CHECK-NEXT:    v_writelane_b32 v0, s10, 42
+; CHECK-NEXT:    v_writelane_b32 v0, s11, 43
+; CHECK-NEXT:    v_writelane_b32 v0, s12, 44
+; CHECK-NEXT:    v_writelane_b32 v0, s13, 45
+; CHECK-NEXT:    v_writelane_b32 v0, s14, 46
+; CHECK-NEXT:    v_writelane_b32 v0, s15, 47
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %ret
 ; CHECK-NEXT:    s_endpgm
@@ -210,170 +206,166 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    v_readlane_b32 s1, v23, 3
 ; CHECK-NEXT:    v_readlane_b32 s2, v23, 4
 ; CHECK-NEXT:    v_readlane_b32 s3, v23, 5
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 6
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 7
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 8
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 9
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[48:51]
+; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 6
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 7
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 8
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 9
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 10
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 11
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 12
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 13
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 10
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 11
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 12
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 13
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 14
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 15
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 16
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 17
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 18
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 19
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 20
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 21
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 22
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 23
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 24
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 25
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 14
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 15
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 16
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 17
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 18
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 19
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 20
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 21
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 22
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 23
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 24
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 25
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 26
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 27
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 28
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 29
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 26
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 27
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 28
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 29
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 30
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 31
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 32
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 33
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 30
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 31
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 32
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 33
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 34
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 35
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 36
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 37
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[38:39]
+; CHECK-NEXT:    ; use s[42:43]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[44:47]
+; CHECK-NEXT:    ; use s[52:55]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 34
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 35
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 36
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 37
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 38
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 39
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 40
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 41
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 38
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 39
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 40
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[36:37]
+; CHECK-NEXT:    ; use s[40:41]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[40:43]
+; CHECK-NEXT:    ; use s[36:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:7]
+; CHECK-NEXT:    ; use s[44:51]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 42
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 43
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 44
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 45
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 46
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 47
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 48
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 49
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 50
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 51
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 52
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 53
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 54
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 55
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 56
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 57
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 42
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 43
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 44
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 45
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 46
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 47
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 48
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 49
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 50
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 51
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 52
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 53
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 58
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 59
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 60
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 61
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 54
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 55
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 56
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 57
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 62
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 63
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 0
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 3
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 58
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 59
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 60
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 61
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 62
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 63
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 0
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 1
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 19
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 2
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s8, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s9, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s10, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s11, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s12, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s13, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s14, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s15, v0, 17
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 19
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 23
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:3]
+; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_readlane_b32 s0, v0, 24
 ; CHECK-NEXT:    v_readlane_b32 s1, v0, 25
 ; CHECK-NEXT:    v_readlane_b32 s2, v0, 26
 ; CHECK-NEXT:    v_readlane_b32 s3, v0, 27
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 31
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 31
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 39
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 40
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 41
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 42
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 43
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 44
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 45
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 46
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 47
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 48
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 49
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 50
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 51
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 37
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 39
+; CHECK-NEXT:    v_readlane_b32 s8, v0, 40
+; CHECK-NEXT:    v_readlane_b32 s9, v0, 41
+; CHECK-NEXT:    v_readlane_b32 s10, v0, 42
+; CHECK-NEXT:    v_readlane_b32 s11, v0, 43
+; CHECK-NEXT:    v_readlane_b32 s12, v0, 44
+; CHECK-NEXT:    v_readlane_b32 s13, v0, 45
+; CHECK-NEXT:    v_readlane_b32 s14, v0, 46
+; CHECK-NEXT:    v_readlane_b32 s15, v0, 47
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index a51b0128a3a4a4..59036c64c8afcc 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -7,7 +7,7 @@
 define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s13
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_load_dword s4, s[6:7], 0x2
 ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index f69b7ae1051240..ebc916b5c889b5 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -182,10 +182,8 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_shl_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -205,7 +203,6 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -218,10 +215,8 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_lshr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -241,7 +236,6 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -254,10 +248,8 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_ashr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, 64, s4
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
@@ -278,7 +270,6 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -439,9 +430,6 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_shl_v2i128ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -514,9 +502,6 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_lshr_v2i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -589,9 +574,6 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_ashr_v2i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index a38f0a6d86b8ca..b8721129222043 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -9,9 +9,6 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -23,9 +20,6 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -44,14 +38,11 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -61,14 +52,11 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -84,9 +72,6 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitcmp1_b32 s2, 0
 ; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -101,9 +86,6 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitcmp1_b32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -122,9 +104,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; CI-LABEL: s_sint_to_fp_i64_to_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
 ; CI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -138,9 +117,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-LABEL: s_sint_to_fp_i64_to_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -160,9 +136,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -182,9 +155,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -213,9 +183,6 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i8 s2, s2
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
@@ -228,9 +195,6 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bfe_i32 s2, s2, 0x80000
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
@@ -268,14 +232,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -285,14 +246,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -325,14 +283,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -342,14 +297,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -401,14 +353,11 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -418,14 +367,11 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 0a6009c0e7da8a..b4a981f1db4ec7 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; GCN-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %31.sub0
-  ; GCN-NEXT:   SI_SPILL_V64_SAVE %31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0
+  ; GCN-NEXT:   SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]]
   ; GCN-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index b2e334c66ccd25..3644bef9c20a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -50,10 +50,7 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
 define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII:       ; %bb.0:
-; HAWAII-NEXT:    s_add_i32 s10, s10, s15
 ; HAWAII-NEXT:    s_or_b32 s0, s6, 14
-; HAWAII-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; HAWAII-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, s7
 ; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -73,10 +70,7 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI:       ; %bb.0:
-; FIJI-NEXT:    s_add_i32 s10, s10, s15
 ; FIJI-NEXT:    s_or_b32 s0, s6, 14
-; FIJI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; FIJI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; FIJI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIJI-NEXT:    v_mov_b32_e32 v1, s7
 ; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 30accc846d2b6a..19d633651fdd0d 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
+; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     9
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 4f84b31f1877b0..2097579e0c9959 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
+; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000  ................
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     5
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 0b1bd11b88d5d6..775c62e73261a9 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s
@@ -7,17 +6,15 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM:       ; %bb.0: ; %entry
-; ASM-NEXT:    ;;#ASMSTART
-; ASM-NEXT:    ;;#ASMEND
-; ASM-NEXT:    s_endpgm
+; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
 ; OBJ: Contents of section .rodata:
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000  @.......!.......
+; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     9
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 2516177691ce3c..4dfd4c095c87a0 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -23,14 +23,11 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-TRAP-GFX803-LABEL: trap:
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s2
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s3
-; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    s_trap 2
@@ -124,9 +121,6 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-TRAP-GFX803-LABEL: non_entry_trap:
 ; HSA-TRAP-GFX803:       ; %bb.0: ; %entry
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
@@ -288,9 +282,6 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s4
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s5
@@ -423,13 +414,10 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-TRAP-GFX803-LABEL: debugtrap:
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 317e350f3eafe5..735956caa72da4 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -81,9 +81,6 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-LABEL: udiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -255,9 +252,6 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GCN-LABEL: s_udiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GCN-NEXT:    s_sub_i32 s4, 0, s3
@@ -463,9 +457,6 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: udiv_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -819,9 +810,6 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: udiv_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -1147,9 +1135,6 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: udiv_i32_div_pow2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1239,9 +1224,6 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: udiv_i32_div_k_even:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1336,9 +1318,6 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: udiv_i32_div_k_odd:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1451,9 +1430,6 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-LABEL: v_udiv_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1594,9 +1570,6 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1753,9 +1726,6 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 4
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -1953,9 +1923,6 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i24:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 4
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -2138,9 +2105,6 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
 ; GCN-LABEL: scalarize_mulhu_4xi32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -2254,9 +2218,6 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
 ; GCN-LABEL: test_udiv2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -2320,9 +2281,6 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
@@ -2413,9 +2371,6 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
 ;
 ; GCN-LABEL: fdiv_test_denormals:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 44b16d7f65dc55..d00ea6dff24474 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,9 +9,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -31,9 +28,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -60,9 +54,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; SI-LABEL: s_uint_to_fp_i64_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -76,9 +67,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-LABEL: s_uint_to_fp_i64_to_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -98,9 +86,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
@@ -118,9 +103,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
@@ -144,9 +126,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) {
 ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x8
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -179,9 +158,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
 ;
 ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -220,9 +196,6 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
@@ -234,9 +207,6 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -252,9 +222,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
 ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -272,9 +239,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
@@ -295,9 +259,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
@@ -325,14 +286,11 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -342,14 +300,11 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -365,9 +320,6 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bitcmp1_b32 s2, 0
 ; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -382,9 +334,6 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitcmp1_b32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -404,9 +353,6 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s2, s2, 0xff
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -419,9 +365,6 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -459,14 +402,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -476,14 +416,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -516,14 +453,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -533,14 +467,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -574,14 +505,11 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -591,14 +519,11 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 43a948b6c6ab27..a827ebe96cfcf4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -11,7 +11,7 @@
 define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s13
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 33f629a3c4f0cb..4545c8bbeb3e6c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -25,9 +25,8 @@
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr8' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
@@ -41,7 +40,7 @@
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: body:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 1fbd3760eed26c..8215ba834170f2 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -25,9 +25,8 @@
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr8' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
@@ -41,7 +40,7 @@
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: body:

>From 3e84d162d2247846e9253758e2573ca8998adf7f Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Tue, 3 Sep 2024 16:25:45 -0700
Subject: [PATCH 05/13] Fix tests after merge from main branch

---
 clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu  | 2 +-
 .../AMDGPU/attributor-flatscratchinit-globalisel.ll       | 8 ++++----
 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll    | 8 ++++----
 llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll          | 2 +-
 llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll  | 2 +-
 llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll        | 3 ++-
 llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll          | 2 +-
 7 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index b295bbbdaaf955..7f016180629e4b 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -432,7 +432,7 @@ __global__ void kernel4(struct S s) {
 // CHECK-SPIRV-NEXT:    ret void
 //
 // OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S(
-// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
 // OPT-NEXT:  [[ENTRY:.*:]]
 // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8
 // OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index b7503f26b1ab6d..9efdf3cbb86067 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -4,7 +4,7 @@
 ;; tests of alloca
 
 define void @without_alloca(i1 %arg0) {
-  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
@@ -15,7 +15,7 @@ define void @with_alloca() {
 }
 
 define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
-  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
@@ -453,13 +453,13 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
   %val = call i32 @llvm.amdgcn.workitem.id.x()
-  store volatile i32 %val, ptr addrspace(1) undef
+  store volatile i32 %val, ptr addrspace(1) null
   ret void
 }
 
 define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
   %val = call i32 @llvm.amdgcn.workitem.id.x()
-  store volatile i32 %val, ptr addrspace(1) undef
+  store volatile i32 %val, ptr addrspace(1) null
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index f04c93961b6705..d32ef070b983e4 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -10,7 +10,7 @@ define void @without_alloca(i1 %arg0) {
 ;
 ; GFX10-LABEL: define void @without_alloca(i1 %arg0)
 ; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
-  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
@@ -31,7 +31,7 @@ define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
 ; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
-  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
@@ -858,7 +858,7 @@ define void @use_intrinsic_workitem_id_x() {
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
 ; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
-  store volatile i32 %val, ptr addrspace(1) undef
+  store volatile i32 %val, ptr addrspace(1) null
   ret void
 }
 
@@ -869,7 +869,7 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
 ; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
-  store volatile i32 %val, ptr addrspace(1) undef
+  store volatile i32 %val, ptr addrspace(1) null
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index ebca9906998787..55113a3b6f263e 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() {
 }
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 074dba1cbcc931..df6a75c7376df4 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -43,5 +43,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index 8562cdf195b022..4e3fc24c4d13ad 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -101,7 +101,8 @@ entry:
 
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
+; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
+; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
 ; NO: [[META0]] = !{ptr @bar1, ptr @bar2}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 183cdb2e1f8627..b5f5e3cf1a1a78 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -81,7 +81,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
 ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.

>From bf274dc4ee02e6c24bd2855fd11d9e79970c3137 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Wed, 4 Sep 2024 14:20:24 -0700
Subject: [PATCH 06/13] Undo changes made to AMDGPUSubtarget.cpp to limit
 impact on tests. This undo is simply achieved by merging code from upstream
 because a recent commit has changed that file. The changes therein will be
 included in a separate PR.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |   4 +-
 .../annotate-kernel-features-hsa-call.ll      |  89 ++---
 .../AMDGPU/annotate-kernel-features-hsa.ll    |  34 +-
 .../AMDGPU/attributor-flatscratchinit.ll      | 372 +++++++++---------
 .../CodeGen/AMDGPU/direct-indirect-call.ll    |   2 +-
 .../AMDGPU/duplicate-attribute-indirect.ll    |   2 +-
 .../CodeGen/AMDGPU/simple-indirect-call-2.ll  |   2 +-
 7 files changed, 251 insertions(+), 254 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 03391a36c54c79..ddbbb73dd84e14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -441,7 +441,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     }
 
     for (Instruction &I : instructions(F)) {
-      if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
+      if (isa<AddrSpaceCastInst>(I) &&
+          cast<AddrSpaceCastInst &>(I).getSrcAddressSpace() ==
+              AMDGPUAS::PRIVATE_ADDRESS) {
         removeAssumedBits(FLAT_SCRATCH_INIT);
         return;
       }
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index ab2e28e5f5cbb5..ed136c58379cb3 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -530,7 +530,7 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -545,7 +545,7 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_implicitarg_ptr()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -645,7 +645,7 @@ define internal void @defined.func() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -658,7 +658,7 @@ define void @func_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -673,7 +673,7 @@ define void @func_call_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -687,8 +687,8 @@ define void @func_call_asm() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
-; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR31:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR28:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void asm sideeffect "", ""() #3
@@ -702,7 +702,7 @@ define amdgpu_kernel void @kern_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -717,7 +717,7 @@ define amdgpu_kernel void @func_kern_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -751,7 +751,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @use_dispatch_ptr_ret_type()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -769,7 +769,7 @@ define float @func_indirect_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]]()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -788,7 +788,7 @@ define float @func_extern_call() #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @extern()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -806,7 +806,7 @@ define float @func_null_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float null()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -827,7 +827,7 @@ define float @func_other_intrinsic_call(float %arg) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
-; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR20]] {
+; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -845,7 +845,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -861,7 +861,7 @@ define void @func_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -877,7 +877,7 @@ define void @func_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -893,7 +893,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -912,7 +912,7 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR21]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @extern_func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -928,7 +928,7 @@ define internal void @enqueue_block_def() #6 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR28:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -941,7 +941,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR29:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_decl()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -995,7 +995,7 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() {
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
-; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR32:[0-9]+]]
+; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR29:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void @known_func() #6
@@ -1037,27 +1037,24 @@ attributes #6 = { "enqueued-block" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR26:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR30]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR31]] = { nounwind }
-; ATTRIBUTOR_HSA: attributes #[[ATTR32]] = { "enqueued-block" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind }
+; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index ac5458f56f08b8..bab5b5c31714c6 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -442,9 +442,8 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR12]] {
-; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr [[STOF]], align 4
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
+; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[PTR]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -454,7 +453,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 
 define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_group_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(3) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -466,7 +465,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_private_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -485,7 +484,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR13]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[PTR]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -502,7 +501,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR13]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -513,7 +512,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 
 define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_global_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -525,7 +524,7 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_constant_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4)
 ; HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -544,7 +543,7 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -565,7 +564,7 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -585,7 +584,7 @@ define amdgpu_kernel void @use_alloca() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -606,7 +605,7 @@ define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
 ; ATTRIBUTOR_HSA-NEXT:  entry:
 ; ATTRIBUTOR_HSA-NEXT:    br label [[BB:%.*]]
 ; ATTRIBUTOR_HSA:       bb:
@@ -631,7 +630,7 @@ define void @use_alloca_func() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR14:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -664,10 +663,9 @@ attributes #1 = { nounwind }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index d32ef070b983e4..b33ec280b034c9 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -6,20 +6,20 @@
 
 define void @without_alloca(i1 %arg0) {
 ; GFX9-LABEL: define void @without_alloca(i1 %arg0)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_alloca(i1 %arg0)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI:[0-9]+]]
   store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
 define void @with_alloca() {
 ; GFX9-LABEL: define void @with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -27,20 +27,20 @@ define void @with_alloca() {
 
 define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2:[0-9]+]]
   store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
 define amdgpu_kernel void @with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -50,10 +50,10 @@ define amdgpu_kernel void @with_alloca_cc_kernel() {
 
 define amdgpu_vs void @with_alloca_cc_vs() {
 ; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -61,10 +61,10 @@ define amdgpu_vs void @with_alloca_cc_vs() {
 
 define amdgpu_gs void @with_alloca_cc_gs() {
 ; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -72,10 +72,10 @@ define amdgpu_gs void @with_alloca_cc_gs() {
 
 define amdgpu_ps void @with_alloca_cc_ps() {
 ; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -83,10 +83,10 @@ define amdgpu_ps void @with_alloca_cc_ps() {
 
 define amdgpu_cs void @with_alloca_cc_cs() {
 ; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -94,10 +94,10 @@ define amdgpu_cs void @with_alloca_cc_cs() {
 
 define amdgpu_hs void @with_alloca_cc_hs() {
 ; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -105,10 +105,10 @@ define amdgpu_hs void @with_alloca_cc_hs() {
 
 define amdgpu_ls void @with_alloca_cc_ls() {
 ; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -116,10 +116,10 @@ define amdgpu_ls void @with_alloca_cc_ls() {
 
 define amdgpu_es void @with_alloca_cc_es() {
 ; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -127,10 +127,10 @@ define amdgpu_es void @with_alloca_cc_es() {
 
 define amdgpu_gfx void @with_alloca_cc_gfx() {
 ; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -138,10 +138,10 @@ define amdgpu_gfx void @with_alloca_cc_gfx() {
 
 define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
 ; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2]]
 ;
 ; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -149,10 +149,10 @@ define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
 
 define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
 ; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2]]
 ;
 ; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -160,40 +160,40 @@ define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
 
 define void @call_without_alloca() {
 ; GFX9-LABEL: define void @call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @without_alloca(i1 true)
   ret void
 }
 
 define amdgpu_kernel void @call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @without_alloca(i1 true)
   ret void
 }
 
 define void @call_with_alloca() {
 ; GFX9-LABEL: define void @call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @with_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @with_alloca()
   ret void
 }
@@ -222,50 +222,50 @@ define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
 
 define void @call_call_without_alloca() {
 ; GFX9-LABEL: define void @call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @call_without_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @call_without_alloca()
   ret void
 }
 
 define void @call_call_with_alloca() {
 ; GFX9-LABEL: define void @call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @call_with_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @call_with_alloca()
   ret void
 }
 
 define void @with_alloca_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -286,10 +286,10 @@ define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 
 define void @with_alloca_call_with_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @with_alloca()
@@ -298,10 +298,10 @@ define void @with_alloca_call_with_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @with_alloca()
@@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -334,10 +334,10 @@ define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_with_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_with_alloca()
@@ -346,10 +346,10 @@ define void @with_alloca_call_call_with_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_with_alloca()
@@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 
 define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 
 define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -444,30 +444,30 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 
 define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -486,30 +486,30 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp
 
 define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -517,10 +517,10 @@ define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 
 define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -528,30 +528,30 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add
 
 define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
 
 define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp
 
 define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -632,70 +632,70 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec
 
 define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -704,10 +704,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
 
 define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -716,10 +716,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
 
 define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -728,10 +728,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
 
 define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -742,10 +742,10 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_
 
 define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @without_alloca(i1 true)
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -753,10 +753,10 @@ define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr
 
 define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @without_alloca(i1 true)
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -764,10 +764,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern
 
 define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -775,10 +775,10 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -790,10 +790,10 @@ define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(
 
 define void @with_indirect_call() {
 ; GFX9-LABEL: define void @with_indirect_call()
-; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+; GFX9-SAME:  #[[ATTR6_GFX9_IND_CALL:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_indirect_call()
-; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]] {
+; GFX10-SAME:  #[[ATTR6_GFX10_IND_CALL:[0-9]+]] {
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
@@ -801,10 +801,10 @@ define void @with_indirect_call() {
 
 define amdgpu_kernel void @with_indirect_call_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR7_GFX9_IND_CALL2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR7_GFX10_IND_CALL2:[0-9]+]]
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
@@ -812,20 +812,20 @@ define amdgpu_kernel void @with_indirect_call_cc_kernel() {
 
 define void @call_with_indirect_call() {
 ; GFX9-LABEL: define void @call_with_indirect_call()
-; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+; GFX9-SAME:  #[[ATTR6_GFX9_IND_CALL]]
 ;
 ; GFX10-LABEL: define void @call_with_indirect_call()
-; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]]
+; GFX10-SAME:  #[[ATTR6_GFX10_IND_CALL]]
   call void @with_indirect_call()
   ret void
 }
 
 define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR7_GFX9_IND_CALL2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR7_GFX10_IND_CALL2]]
   call void @with_indirect_call()
   ret void
 }
@@ -840,10 +840,10 @@ define void @also_empty() {
 
 define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) {
 ; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI3:[0-9]+]]
   %fptr = select i1 %cond, ptr @empty, ptr @also_empty
   call void %fptr()
   ret void
@@ -853,10 +853,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR10_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR10_GFX10_NOFSI4:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) null
   ret void
@@ -864,10 +864,10 @@ define void @use_intrinsic_workitem_id_x() {
 
 define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) null
   ret void
@@ -875,30 +875,30 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 
 define void @call_use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4]]
+; GFX9-SAME:  #[[ATTR10_GFX9_NOFSI4]]
 ;
 ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4]]
+; GFX10-SAME:  #[[ATTR10_GFX10_NOFSI4]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR11_GFX9_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR11_GFX10_NOFSI5:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
   store volatile i32 7, ptr %1, align 4
   ret void
@@ -906,65 +906,65 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
 
 define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @with_inline_asm() {
 ; GFX9-LABEL: with_inline_asm
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3]]
+; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI3]]
 ;
 ; GFX10-LABEL: with_inline_asm
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3]]
+; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI3]]
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
 }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR0_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR1_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR2_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR3_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR4_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR5_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR6_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR7_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR8_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR10_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR11_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
 
 
 
 
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR0_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR1_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR2_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR3_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR4_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR5_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR6_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR7_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR8_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR10_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR11_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index 55113a3b6f263e..c108b93766bcc8 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() {
 }
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index df6a75c7376df4..908746a0c7784f 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -43,5 +43,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index 4e3fc24c4d13ad..5326c9e2c12f08 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -102,7 +102,7 @@ entry:
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
-; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
 ; NO: [[META0]] = !{ptr @bar1, ptr @bar2}
 ;.

>From 703eecc9184857e80125b36e1c77f13ca78b55ba Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Wed, 4 Sep 2024 16:50:38 -0700
Subject: [PATCH 07/13] Code improvement in the lambda.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index ddbbb73dd84e14..9a7752b0f63bb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -707,15 +707,16 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       const auto &CB = cast<CallBase>(I);
       const Function *Callee = CB.getCalledFunction();
 
-      if (Callee && Callee->isIntrinsic())
+      // Callee == 0 for inline asm or indirect call with known callees.
+      // In the latter case, updateImpl() already checked the callees and we
+      // know their FLAT_SCRATCH_INIT bit is set.
+      // If function has indirect call with unknown callees, the bit is
+      // already removed in updateImpl() and execution won't reach here.
+      if (!Callee)
+        return true;
+      else
         return Callee->getIntrinsicID() !=
                Intrinsic::amdgcn_addrspacecast_nonnull;
-
-      // Return true for all other cases, including (1)inline asm, (2)direct
-      // call, and (3)indirect call with known callees. For (2) and (3)
-      // updateImpl() already checked the callees and we know their
-      // FLAT_SCRATCH_INIT bit is set.
-      return true;
     };
 
     bool UsedAssumedInformation = false;

>From a5972c657950ec7b62cfbc974dce8764f773abc2 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Wed, 18 Sep 2024 15:55:50 -0700
Subject: [PATCH 08/13] (1) check constant for addrspacecast (2) remove alloca
 related tests.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  41 +-
 .../attributor-flatscratchinit-globalisel.ll  | 443 --------------
 .../AMDGPU/attributor-flatscratchinit.ll      | 556 +++---------------
 3 files changed, 133 insertions(+), 907 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 9a7752b0f63bb0..6f64c1cdb8094d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -440,13 +440,24 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       return;
     }
 
+    SmallPtrSet<const Constant *, 8> VisitedConsts;
+
     for (Instruction &I : instructions(F)) {
       if (isa<AddrSpaceCastInst>(I) &&
-          cast<AddrSpaceCastInst &>(I).getSrcAddressSpace() ==
+          cast<AddrSpaceCastInst>(I).getSrcAddressSpace() ==
               AMDGPUAS::PRIVATE_ADDRESS) {
         removeAssumedBits(FLAT_SCRATCH_INIT);
         return;
       }
+      // check for addrSpaceCast in constant expressions
+      for (const Use &U : I.operands()) {
+        if (const auto *C = dyn_cast<Constant>(U)) {
+          if (constHasASCast(C, VisitedConsts)) {
+            removeAssumedBits(FLAT_SCRATCH_INIT);
+            return;
+          }
+        }
+      }
     }
   }
 
@@ -714,9 +725,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       // already removed in updateImpl() and execution won't reach here.
       if (!Callee)
         return true;
-      else
-        return Callee->getIntrinsicID() !=
-               Intrinsic::amdgcn_addrspacecast_nonnull;
+
+      return Callee->getIntrinsicID() !=
+             Intrinsic::amdgcn_addrspacecast_nonnull;
     };
 
     bool UsedAssumedInformation = false;
@@ -726,6 +737,28 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                               UsedAssumedInformation);
   }
+
+  bool constHasASCast(const Constant *C,
+                      SmallPtrSetImpl<const Constant *> &Visited) {
+    if (!Visited.insert(C).second)
+      return false;
+
+    if (const auto *CE = dyn_cast<ConstantExpr>(C))
+      if (CE->getOpcode() == Instruction::AddrSpaceCast &&
+          CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+              AMDGPUAS::PRIVATE_ADDRESS)
+        return true;
+
+    for (const Use &U : C->operands()) {
+      const auto *OpC = dyn_cast<Constant>(U);
+      if (!OpC || !Visited.insert(OpC).second)
+        continue;
+
+      if (constHasASCast(OpC, Visited))
+        return true;
+    }
+    return false;
+  }
 };
 
 AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index 9efdf3cbb86067..9ae5782298a289 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -1,200 +1,6 @@
 ; Test the generation of the attribute amdgpu-no-flat-scratch-init
 ; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator | FileCheck -check-prefixes=GFX10 %s
 
-;; tests of alloca
-
-define void @without_alloca(i1 %arg0) {
-  store volatile i1 %arg0, ptr addrspace(1) null
-  ret void
-}
-
-define void @with_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
-  store volatile i1 %arg0, ptr addrspace(1) null
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
-
-define amdgpu_vs void @with_alloca_cc_vs() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_gs void @with_alloca_cc_gs() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_ps void @with_alloca_cc_ps() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs void @with_alloca_cc_cs() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_hs void @with_alloca_cc_hs() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_ls void @with_alloca_cc_ls() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_es void @with_alloca_cc_es() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_gfx void @with_alloca_cc_gfx() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define void @call_without_alloca() {
-  call void @without_alloca(i1 true)
-  ret void
-}
-
-define amdgpu_kernel void @call_without_alloca_cc_kernel() {
-  call void @without_alloca(i1 true)
-  ret void
-}
-
-define void @call_with_alloca() {
-  call void @with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_with_alloca_cc_kernel() {
-  call void @with_alloca()
-  ret void
-}
-
-define void @call_both_with_and_without_alloca() {
-  call void @with_alloca()
-  call void @without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
-  call void @with_alloca()
-  call void @without_alloca()
-  ret void
-}
-
-define void @call_call_without_alloca() {
-  call void @call_without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
-  call void @call_without_alloca()
-  ret void
-}
-
-define void @call_call_with_alloca() {
-  call void @call_with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
-  call void @call_with_alloca()
-  ret void
-}
-
-define void @with_alloca_call_without_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @without_alloca()
-  ret void
-}
-
-define void @with_alloca_call_with_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @with_alloca()
-  ret void
-}
-
-define void @with_alloca_call_call_without_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_without_alloca()
-  ret void
-}
-
-define void @with_alloca_call_call_with_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_with_alloca()
-  ret void
-}
-
 ;; tests of addrspacecast
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
@@ -397,32 +203,6 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_
   ret void
 }
 
-;; tests of mixed alloca and addrspacecast
-
-define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
-  call void @without_alloca(i1 true)
-  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-  ret void
-}
-
-define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-  call void @without_alloca(i1 true)
-  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-  ret void
-}
-
-define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
-  call void @without_alloca(i1 true)
-  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-  ret void
-}
-
-define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-  call void @without_alloca(i1 true)
-  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-  ret void
-}
-
 ;; tests of indirect call, intrinsics
 
 @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
@@ -473,201 +253,6 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
   ret void
 }
 
-; GFX10: name:            without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_cc_vs
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_gs
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
-;
-; GFX10: name:            with_alloca_cc_ps
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_cs
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_hs
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
-;
-; GFX10: name:            with_alloca_cc_ls
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_es
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_gfx
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:  psInputAddr:     0
-;
-; GFX10: name:            with_alloca_cc_cs_chain
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
-; GFX10-NEXT:  psInputAddr:     0
-;
-; GFX10: name:            with_alloca_cc_cs_chain_preserve
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
-; GFX10-NEXT:  psInputAddr:     0
-;
-; GFX10: name:            call_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
-;
-; GFX10: name:            call_with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            call_both_with_and_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_both_with_and_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            call_call_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_call_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
-;
-; GFX10: name:            call_call_with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_call_with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_call_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca_call_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_call_with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca_call_with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_call_call_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca_call_call_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_call_call_with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca_call_call_with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
 ; GFX10: name:            without_global_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
@@ -921,34 +506,6 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
 ; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
-; GFX10: name:            call_without_alloca_and_without_addrspacecast
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_without_alloca_and_without_addrspacecast_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            call_without_alloca_and_with_addrspacecast
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_without_alloca_and_with_addrspacecast_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
 ; GFX10: name:            with_indirect_call
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index b33ec280b034c9..2229ce9500cacf 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -2,378 +2,24 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
 
-;; tests of alloca
-
-define void @without_alloca(i1 %arg0) {
-; GFX9-LABEL: define void @without_alloca(i1 %arg0)
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI:[0-9]+]]
-;
-; GFX10-LABEL: define void @without_alloca(i1 %arg0)
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI:[0-9]+]]
-  store volatile i1 %arg0, ptr addrspace(1) null
-  ret void
-}
-
-define void @with_alloca() {
-; GFX9-LABEL: define void @with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
-; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2:[0-9]+]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2:[0-9]+]]
-  store volatile i1 %arg0, ptr addrspace(1) null
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
-
-define amdgpu_vs void @with_alloca_cc_vs() {
-; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS:[0-9]+]]
-;
-; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS:[0-9]+]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_gs void @with_alloca_cc_gs() {
-; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_ps void @with_alloca_cc_ps() {
-; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs void @with_alloca_cc_cs() {
-; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_hs void @with_alloca_cc_hs() {
-; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_ls void @with_alloca_cc_ls() {
-; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_es void @with_alloca_cc_es() {
-; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_gfx void @with_alloca_cc_gfx() {
-; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
-; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2:[0-9]+]]
-;
-; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
-; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2:[0-9]+]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
-; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
-; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2]]
-;
-; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
-; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
-; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
-; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2]]
-;
-; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
-; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define void @call_without_alloca() {
-; GFX9-LABEL: define void @call_without_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_without_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @without_alloca(i1 true)
-  ret void
-}
-
-define amdgpu_kernel void @call_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @without_alloca(i1 true)
-  ret void
-}
-
-define void @call_with_alloca() {
-; GFX9-LABEL: define void @call_with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @with_alloca()
-  ret void
-}
-
-define void @call_both_with_and_without_alloca() {
-; GFX9-LABEL: define void @call_both_with_and_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
-;
-; GFX10-LABEL: define void @call_both_with_and_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
-  call void @with_alloca()
-  call void @without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
-  call void @with_alloca()
-  call void @without_alloca()
-  ret void
-}
-
-define void @call_call_without_alloca() {
-; GFX9-LABEL: define void @call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @call_without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @call_without_alloca()
-  ret void
-}
-
-define void @call_call_with_alloca() {
-; GFX9-LABEL: define void @call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @call_with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @call_with_alloca()
-  ret void
-}
-
-define void @with_alloca_call_without_alloca() {
-; GFX9-LABEL: define void @with_alloca_call_without_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca_call_without_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @without_alloca()
-  ret void
-}
-
-define void @with_alloca_call_with_alloca() {
-; GFX9-LABEL: define void @with_alloca_call_with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca_call_with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @with_alloca()
-  ret void
-}
-
-define void @with_alloca_call_call_without_alloca() {
-; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_without_alloca()
-  ret void
-}
-
-define void @with_alloca_call_call_with_alloca() {
-; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_with_alloca()
-  ret void
-}
-
 ;; tests of addrspacecast
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI:[0-9]+]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2:[0-9]+]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
@@ -548,10 +194,10 @@ define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr a
 
 define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -559,10 +205,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -590,30 +236,30 @@ define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(
 
 define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -621,10 +267,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp
 
 define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -652,50 +298,50 @@ define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_ke
 
 define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -704,10 +350,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
 
 define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -716,10 +362,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
 
 define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -728,59 +374,55 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
 
 define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
-;; tests of mixed alloca and addrspacecast
+;; tests of addrspacecast in a constant
 
-define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @without_alloca(i1 true)
-  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @with_global_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_in_const()
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(1) null to ptr addrspace(0))
   ret void
 }
 
-define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @with_region_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_in_const()
 ; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @without_alloca(i1 true)
-  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(2) null to ptr addrspace(0))
   ret void
 }
 
-define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
-;
-; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
-  call void @without_alloca(i1 true)
-  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @with_local_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @with_local_to_flat_in_const()
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0))
   ret void
 }
 
-define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
-  call void @without_alloca(i1 true)
-  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @with_constant_to_flat_in_const() {
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0))
+  ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_in_const()
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]]
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(5) null to ptr addrspace(0))
+  ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_in_const()
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]]
+  call void @with_private_to_flat_in_const()
   ret void
 }
 
@@ -790,10 +432,10 @@ define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(
 
 define void @with_indirect_call() {
 ; GFX9-LABEL: define void @with_indirect_call()
-; GFX9-SAME:  #[[ATTR6_GFX9_IND_CALL:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_IND_CALL:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_indirect_call()
-; GFX10-SAME:  #[[ATTR6_GFX10_IND_CALL:[0-9]+]] {
+; GFX10-SAME:  #[[ATTR2_GFX10_IND_CALL:[0-9]+]] {
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
@@ -801,10 +443,10 @@ define void @with_indirect_call() {
 
 define amdgpu_kernel void @with_indirect_call_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
-; GFX9-SAME:  #[[ATTR7_GFX9_IND_CALL2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_IND_CALL2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
-; GFX10-SAME:  #[[ATTR7_GFX10_IND_CALL2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_IND_CALL2:[0-9]+]]
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
@@ -812,20 +454,20 @@ define amdgpu_kernel void @with_indirect_call_cc_kernel() {
 
 define void @call_with_indirect_call() {
 ; GFX9-LABEL: define void @call_with_indirect_call()
-; GFX9-SAME:  #[[ATTR6_GFX9_IND_CALL]]
+; GFX9-SAME:  #[[ATTR4_GFX9_IND_CALL:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_with_indirect_call()
-; GFX10-SAME:  #[[ATTR6_GFX10_IND_CALL]]
+; GFX10-SAME:  #[[ATTR4_GFX10_IND_CALL:[0-9]+]]
   call void @with_indirect_call()
   ret void
 }
 
 define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
-; GFX9-SAME:  #[[ATTR7_GFX9_IND_CALL2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_IND_CALL2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
-; GFX10-SAME:  #[[ATTR7_GFX10_IND_CALL2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_IND_CALL2:[0-9]+]]
   call void @with_indirect_call()
   ret void
 }
@@ -840,10 +482,10 @@ define void @also_empty() {
 
 define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) {
 ; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
-; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR6_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
-; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR6_GFX10_NOFSI3:[0-9]+]]
   %fptr = select i1 %cond, ptr @empty, ptr @also_empty
   call void %fptr()
   ret void
@@ -853,10 +495,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR10_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR10_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI4:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) null
   ret void
@@ -875,30 +517,30 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 
 define void @call_use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR10_GFX9_NOFSI4]]
+; GFX9-SAME:  #[[ATTR6_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR10_GFX10_NOFSI4]]
+; GFX10-SAME:  #[[ATTR6_GFX10_NOFSI4:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR11_GFX9_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR9_GFX9_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR11_GFX10_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR9_GFX10_NOFSI5:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
   store volatile i32 7, ptr %1, align 4
   ret void
@@ -906,20 +548,20 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
 
 define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @with_inline_asm() {
 ; GFX9-LABEL: with_inline_asm
-; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI3]]
+; GFX9-SAME:  #[[ATTR6_GFX9_NOFSI3]]
 ;
 ; GFX10-LABEL: with_inline_asm
-; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI3]]
+; GFX10-SAME:  #[[ATTR6_GFX10_NOFSI3]]
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
 }
@@ -928,21 +570,18 @@ define amdgpu_kernel void @with_inline_asm() {
 
 ; GFX9:  attributes #[[ATTR1_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR2_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR3_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR4_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR2_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR5_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR3_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR6_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR7_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR4_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR5_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR8_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR6_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR10_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR8_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR11_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR9_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
 
 
@@ -953,18 +592,15 @@ define amdgpu_kernel void @with_inline_asm() {
 
 ; GFX10:  attributes #[[ATTR1_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR2_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR3_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-
-; GFX10:  attributes #[[ATTR4_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR2_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR5_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR3_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR6_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR7_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR4_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR5_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR8_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR6_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR10_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR8_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR11_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR9_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }

>From b3c81f9ea990305d77c0d0fc940ae355d1a7301f Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Fri, 4 Oct 2024 15:22:12 -0700
Subject: [PATCH 09/13] Add a testcase for addrspacecast inside a constant.

---
 .../AMDGPU/attributor-flatscratchinit.ll      | 44 +++----------------
 1 file changed, 7 insertions(+), 37 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index 2229ce9500cacf..6984bf45a4bb41 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -386,43 +386,13 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_
 
 ;; tests of addrspacecast in a constant
 
-define amdgpu_kernel void @with_global_to_flat_in_const() {
-; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_in_const()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(1) null to ptr addrspace(0))
-  ret void
-}
-
-define amdgpu_kernel void @with_region_to_flat_in_const() {
-; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_in_const()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(2) null to ptr addrspace(0))
-  ret void
-}
-
-define amdgpu_kernel void @with_local_to_flat_in_const() {
-; GFX9-LABEL: define amdgpu_kernel void @with_local_to_flat_in_const()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0))
-  ret void
-}
-
-define amdgpu_kernel void @with_constant_to_flat_in_const() {
-  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0))
-  ret void
-}
-
-define amdgpu_kernel void @with_private_to_flat_in_const() {
-; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_in_const()
-; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]]
-  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(5) null to ptr addrspace(0))
-  ret void
-}
-
-define amdgpu_kernel void @call_with_private_to_flat_in_const() {
-; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_in_const()
-; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]]
-  call void @with_private_to_flat_in_const()
+define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) {
+; GFX9-LABEL: define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out)
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out)
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
+  store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8
   ret void
 }
 

>From 09012f48c8a5242db946656c905a60df66b820be Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Mon, 7 Oct 2024 10:40:51 -0700
Subject: [PATCH 10/13] Update tests after merging from main.

---
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll     |  41 +-
 ...amdgpu-codegenprepare-fold-binop-select.ll |   7 +-
 .../AMDGPU/annotate-kernel-features-hsa.ll    |   9 +-
 .../AMDGPU/call-graph-register-usage.ll       |  12 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  |  52 +--
 llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll   |  24 +-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |   2 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |   1 +
 ...al-regcopy-and-spill-missed-at-regalloc.ll |  27 +-
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |   2 +-
 .../scc-clobbered-sgpr-to-vmem-spill.ll       | 442 +++++++++---------
 .../CodeGen/AMDGPU/spill-vector-superclass.ll |   6 +-
 12 files changed, 309 insertions(+), 316 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 62f6890e92662e..7336543b41cbc8 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -63,16 +63,23 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
 
 ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
 
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
-; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base
+
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 
 ; HSA:  .amdhsa_user_sgpr_private_segment_buffer 1
 ; HSA:  .amdhsa_user_sgpr_dispatch_ptr 0
@@ -252,11 +259,8 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
 
 ; FIXME: Shouldn't need to enable queue ptr
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
 define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
@@ -277,12 +281,7 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
 
 ; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
 
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index bf72cccd912cee..de318e7ae31a5b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -393,12 +393,11 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x83
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x80
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s0, 0
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_movk_i32 s0, 0x80
+; GCN-NEXT:    s_cselect_b32 s0, s0, 0x83
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
   %select = select i1 %cond, i16 5, i16 8
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index bab5b5c31714c6..239bdfde323cf5 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -443,7 +443,8 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
-; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[PTR]], align 4
+; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
+; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr [[STOF]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -485,7 +486,8 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[PTR]], align 4
+; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr [[STOF]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
@@ -502,7 +504,8 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
-; ATTRIBUTOR_HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4
+; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
+; ATTRIBUTOR_HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 51fff3444324fa..dbd00f09943c01 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -33,9 +33,9 @@ define void @indirect_use_vcc() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
-; CI: ; NumSgprs: 38
-; VI-NOBUG: ; NumSgprs: 40
-; VI-BUG: ; NumSgprs: 96
+; CI: ; TotalNumSgprs: 38
+; VI-NOBUG: ; TotalNumSgprs: 40
+; VI-BUG: ; TotalNumSgprs: 96
 ; GCN: ; NumVgprs: 41
 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
   call void @indirect_use_vcc()
@@ -121,9 +121,9 @@ define void @indirect_use_80_sgpr() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
-; CI: ; NumSgprs: 84
-; VI-NOBUG: ; NumSgprs: 86
-; VI-BUG: ; NumSgprs: 96
+; CI: ; TotalNumSgprs: 84
+; VI-NOBUG: ; TotalNumSgprs: 86
+; VI-BUG: ; TotalNumSgprs: 96
 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
   call void @indirect_use_80_sgpr()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index e4ffedd686ac93..6deab28fbe2e98 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -25,11 +25,11 @@
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; NumSgprs: 8
-; VI-NOXNACK: ; NumSgprs: 8
-; VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 8
+; CI: ; TotalNumSgprs: 8
+; VI-NOXNACK: ; TotalNumSgprs: 8
+; VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -42,11 +42,11 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; NumSgprs: 10
-; VI-NOXNACK: ; NumSgprs: 10
-; VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 10
+; CI: ; TotalNumSgprs: 10
+; VI-NOXNACK: ; TotalNumSgprs: 10
+; VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -59,11 +59,11 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; NumSgprs: 12
-; VI-NOXNACK: ; NumSgprs: 14
-; VI-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 8
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
+; VI-XNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -76,11 +76,11 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; NumSgprs: 12
-; VI-NOXNACK: ; NumSgprs: 14
-; VI-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 10
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
+; VI-XNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -99,8 +99,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -116,8 +116,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -133,8 +133,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 4d62d30a38ed34..292722c2607add 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %11
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %9
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %9
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() {
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %11
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %11
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %9
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
   call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -46,16 +46,16 @@ define amdgpu_kernel void @v_input_output_i128() {
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %11
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %11
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %9
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
   call void asm sideeffect "; use $0", "a"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 758e8b6e1e2ee1..957f404c8cdbed 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
 ; GCN-NOT: writelane
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; NumSgprs: 37
+; GCN: ; TotalNumSgprs: 37
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
   %vgpr = load volatile i32, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 02ce58dd754032..05ef2698c1f774 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -591,6 +591,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s4, s2, 24
 ; VI-NEXT:    s_bfe_i32 s5, s2, 0x80010
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 5a8d0f3d0f158c..b40d35dbd8ac64 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -11,19 +11,15 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX908-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX908-NEXT: {{  $}}
   ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26
-  ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:av_128 = COPY %26
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23
-  ; REGALLOC-GFX908-NEXT:   SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX908-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %6
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
-  ; REGALLOC-GFX908-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
-  ; REGALLOC-GFX908-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
   ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   S_ENDPGM 0
@@ -60,18 +56,15 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX90A-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX90A-NEXT: {{  $}}
   ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25
-  ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:av_128_align2 = COPY %25
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23
-  ; REGALLOC-GFX90A-NEXT:   SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %6
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
-  ; REGALLOC-GFX90A-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
-  ; REGALLOC-GFX90A-NEXT:   [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   S_ENDPGM 0
   ;
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 58a6437bcbf696..8792e60bb0ca16 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -178,7 +178,7 @@ define internal void @mutual_recursion_1(i16 %arg) {
 
 define amdgpu_kernel void @kernel_lds_recursion() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion(
-; CHECK-SAME: ) #[[ATTR6:[0-9]+]] !llvm.amdgcn.lds.kernel.id !9 {
+; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ]
 ; CHECK-NEXT:    call void @mutual_recursion_0(i16 0)
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 5dcb4b7c979ab1..7f8240eeb98ebf 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -26,40 +26,40 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 2
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 3
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 4
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 5
+; CHECK-NEXT:    v_writelane_b32 v22, s4, 2
+; CHECK-NEXT:    v_writelane_b32 v22, s5, 3
+; CHECK-NEXT:    v_writelane_b32 v22, s6, 4
+; CHECK-NEXT:    v_writelane_b32 v22, s7, 5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:11]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 6
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 7
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 8
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 9
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 10
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 11
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 12
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 13
+; CHECK-NEXT:    v_writelane_b32 v22, s4, 6
+; CHECK-NEXT:    v_writelane_b32 v22, s5, 7
+; CHECK-NEXT:    v_writelane_b32 v22, s6, 8
+; CHECK-NEXT:    v_writelane_b32 v22, s7, 9
+; CHECK-NEXT:    v_writelane_b32 v22, s8, 10
+; CHECK-NEXT:    v_writelane_b32 v22, s9, 11
+; CHECK-NEXT:    v_writelane_b32 v22, s10, 12
+; CHECK-NEXT:    v_writelane_b32 v22, s11, 13
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:19]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 14
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 15
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 16
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 17
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 18
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 19
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 20
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 21
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 22
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 23
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 24
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 25
-; CHECK-NEXT:    v_writelane_b32 v23, s16, 26
-; CHECK-NEXT:    v_writelane_b32 v23, s17, 27
-; CHECK-NEXT:    v_writelane_b32 v23, s18, 28
-; CHECK-NEXT:    v_writelane_b32 v23, s19, 29
+; CHECK-NEXT:    v_writelane_b32 v22, s4, 14
+; CHECK-NEXT:    v_writelane_b32 v22, s5, 15
+; CHECK-NEXT:    v_writelane_b32 v22, s6, 16
+; CHECK-NEXT:    v_writelane_b32 v22, s7, 17
+; CHECK-NEXT:    v_writelane_b32 v22, s8, 18
+; CHECK-NEXT:    v_writelane_b32 v22, s9, 19
+; CHECK-NEXT:    v_writelane_b32 v22, s10, 20
+; CHECK-NEXT:    v_writelane_b32 v22, s11, 21
+; CHECK-NEXT:    v_writelane_b32 v22, s12, 22
+; CHECK-NEXT:    v_writelane_b32 v22, s13, 23
+; CHECK-NEXT:    v_writelane_b32 v22, s14, 24
+; CHECK-NEXT:    v_writelane_b32 v22, s15, 25
+; CHECK-NEXT:    v_writelane_b32 v22, s16, 26
+; CHECK-NEXT:    v_writelane_b32 v22, s17, 27
+; CHECK-NEXT:    v_writelane_b32 v22, s18, 28
+; CHECK-NEXT:    v_writelane_b32 v22, s19, 29
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[42:43]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -69,14 +69,14 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:11]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 30
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 31
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 32
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 33
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 34
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 35
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 36
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 37
+; CHECK-NEXT:    v_writelane_b32 v22, s4, 30
+; CHECK-NEXT:    v_writelane_b32 v22, s5, 31
+; CHECK-NEXT:    v_writelane_b32 v22, s6, 32
+; CHECK-NEXT:    v_writelane_b32 v22, s7, 33
+; CHECK-NEXT:    v_writelane_b32 v22, s8, 34
+; CHECK-NEXT:    v_writelane_b32 v22, s9, 35
+; CHECK-NEXT:    v_writelane_b32 v22, s10, 36
+; CHECK-NEXT:    v_writelane_b32 v22, s11, 37
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
@@ -94,105 +94,105 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 38
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 39
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 40
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 41
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 42
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 43
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 44
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 45
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 46
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 47
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 48
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 49
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 50
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 51
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 52
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 53
+; CHECK-NEXT:    v_writelane_b32 v22, s0, 38
+; CHECK-NEXT:    v_writelane_b32 v22, s1, 39
+; CHECK-NEXT:    v_writelane_b32 v22, s2, 40
+; CHECK-NEXT:    v_writelane_b32 v22, s3, 41
+; CHECK-NEXT:    v_writelane_b32 v22, s4, 42
+; CHECK-NEXT:    v_writelane_b32 v22, s5, 43
+; CHECK-NEXT:    v_writelane_b32 v22, s6, 44
+; CHECK-NEXT:    v_writelane_b32 v22, s7, 45
+; CHECK-NEXT:    v_writelane_b32 v22, s8, 46
+; CHECK-NEXT:    v_writelane_b32 v22, s9, 47
+; CHECK-NEXT:    v_writelane_b32 v22, s10, 48
+; CHECK-NEXT:    v_writelane_b32 v22, s11, 49
+; CHECK-NEXT:    v_writelane_b32 v22, s12, 50
+; CHECK-NEXT:    v_writelane_b32 v22, s13, 51
+; CHECK-NEXT:    v_writelane_b32 v22, s14, 52
+; CHECK-NEXT:    v_writelane_b32 v22, s15, 53
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 54
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 55
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 56
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 57
+; CHECK-NEXT:    v_writelane_b32 v22, s0, 54
+; CHECK-NEXT:    v_writelane_b32 v22, s1, 55
+; CHECK-NEXT:    v_writelane_b32 v22, s2, 56
+; CHECK-NEXT:    v_writelane_b32 v22, s3, 57
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 58
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 59
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 60
-; CHECK-NEXT:    ; implicit-def: $vgpr0
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 61
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 62
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 0
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 63
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 1
+; CHECK-NEXT:    v_writelane_b32 v22, s0, 58
+; CHECK-NEXT:    v_writelane_b32 v22, s1, 59
+; CHECK-NEXT:    v_writelane_b32 v22, s2, 60
+; CHECK-NEXT:    ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
+; CHECK-NEXT:    v_writelane_b32 v22, s3, 61
+; CHECK-NEXT:    v_writelane_b32 v22, s4, 62
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 0
+; CHECK-NEXT:    v_writelane_b32 v22, s5, 63
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 1
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 17
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 2
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 3
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 4
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 5
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 6
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 7
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 8
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 9
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 10
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 11
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 12
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 13
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 14
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 15
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 16
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 17
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 19
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 18
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 19
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 23
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 20
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 21
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 22
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 23
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 24
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 25
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 26
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 31
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 24
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 25
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 26
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 27
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 28
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 29
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 30
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 31
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 36
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 37
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 38
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 39
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 40
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 41
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 42
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 43
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 44
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 45
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 46
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 47
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 32
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 33
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 34
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 35
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 36
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 37
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 38
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 39
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 40
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 41
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 42
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 43
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 44
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 45
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 46
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 47
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %ret
 ; CHECK-NEXT:    s_endpgm
@@ -202,51 +202,51 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 2
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 3
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 4
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 5
+; CHECK-NEXT:    v_readlane_b32 s0, v22, 2
+; CHECK-NEXT:    v_readlane_b32 s1, v22, 3
+; CHECK-NEXT:    v_readlane_b32 s2, v22, 4
+; CHECK-NEXT:    v_readlane_b32 s3, v22, 5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 6
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 7
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 8
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 9
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 10
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 12
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 13
+; CHECK-NEXT:    v_readlane_b32 s0, v22, 6
+; CHECK-NEXT:    v_readlane_b32 s1, v22, 7
+; CHECK-NEXT:    v_readlane_b32 s2, v22, 8
+; CHECK-NEXT:    v_readlane_b32 s3, v22, 9
+; CHECK-NEXT:    v_readlane_b32 s4, v22, 10
+; CHECK-NEXT:    v_readlane_b32 s5, v22, 11
+; CHECK-NEXT:    v_readlane_b32 s6, v22, 12
+; CHECK-NEXT:    v_readlane_b32 s7, v22, 13
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 14
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 15
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 16
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 17
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 18
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 19
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 20
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 21
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 22
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 23
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 24
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 25
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 26
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 27
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 28
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 29
+; CHECK-NEXT:    v_readlane_b32 s0, v22, 14
+; CHECK-NEXT:    v_readlane_b32 s1, v22, 15
+; CHECK-NEXT:    v_readlane_b32 s2, v22, 16
+; CHECK-NEXT:    v_readlane_b32 s3, v22, 17
+; CHECK-NEXT:    v_readlane_b32 s4, v22, 18
+; CHECK-NEXT:    v_readlane_b32 s5, v22, 19
+; CHECK-NEXT:    v_readlane_b32 s6, v22, 20
+; CHECK-NEXT:    v_readlane_b32 s7, v22, 21
+; CHECK-NEXT:    v_readlane_b32 s8, v22, 22
+; CHECK-NEXT:    v_readlane_b32 s9, v22, 23
+; CHECK-NEXT:    v_readlane_b32 s10, v22, 24
+; CHECK-NEXT:    v_readlane_b32 s11, v22, 25
+; CHECK-NEXT:    v_readlane_b32 s12, v22, 26
+; CHECK-NEXT:    v_readlane_b32 s13, v22, 27
+; CHECK-NEXT:    v_readlane_b32 s14, v22, 28
+; CHECK-NEXT:    v_readlane_b32 s15, v22, 29
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 30
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 31
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 32
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 33
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 34
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 35
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 36
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 37
+; CHECK-NEXT:    v_readlane_b32 s0, v22, 30
+; CHECK-NEXT:    v_readlane_b32 s1, v22, 31
+; CHECK-NEXT:    v_readlane_b32 s2, v22, 32
+; CHECK-NEXT:    v_readlane_b32 s3, v22, 33
+; CHECK-NEXT:    v_readlane_b32 s4, v22, 34
+; CHECK-NEXT:    v_readlane_b32 s5, v22, 35
+; CHECK-NEXT:    v_readlane_b32 s6, v22, 36
+; CHECK-NEXT:    v_readlane_b32 s7, v22, 37
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[42:43]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -256,10 +256,10 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 38
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 39
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 40
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 41
+; CHECK-NEXT:    v_readlane_b32 s0, v22, 38
+; CHECK-NEXT:    v_readlane_b32 s1, v22, 39
+; CHECK-NEXT:    v_readlane_b32 s2, v22, 40
+; CHECK-NEXT:    v_readlane_b32 s3, v22, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -272,100 +272,100 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[44:51]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 42
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 43
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 44
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 45
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 46
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 47
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 48
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 49
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 50
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 51
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 52
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 53
+; CHECK-NEXT:    v_readlane_b32 s4, v22, 42
+; CHECK-NEXT:    v_readlane_b32 s5, v22, 43
+; CHECK-NEXT:    v_readlane_b32 s6, v22, 44
+; CHECK-NEXT:    v_readlane_b32 s7, v22, 45
+; CHECK-NEXT:    v_readlane_b32 s8, v22, 46
+; CHECK-NEXT:    v_readlane_b32 s9, v22, 47
+; CHECK-NEXT:    v_readlane_b32 s10, v22, 48
+; CHECK-NEXT:    v_readlane_b32 s11, v22, 49
+; CHECK-NEXT:    v_readlane_b32 s12, v22, 50
+; CHECK-NEXT:    v_readlane_b32 s13, v22, 51
+; CHECK-NEXT:    v_readlane_b32 s14, v22, 52
+; CHECK-NEXT:    v_readlane_b32 s15, v22, 53
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 54
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 55
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 56
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 57
+; CHECK-NEXT:    v_readlane_b32 s0, v22, 54
+; CHECK-NEXT:    v_readlane_b32 s1, v22, 55
+; CHECK-NEXT:    v_readlane_b32 s2, v22, 56
+; CHECK-NEXT:    v_readlane_b32 s3, v22, 57
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 58
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 59
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 60
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 61
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 62
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 63
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 0
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s0, v22, 58
+; CHECK-NEXT:    v_readlane_b32 s1, v22, 59
+; CHECK-NEXT:    v_readlane_b32 s2, v22, 60
+; CHECK-NEXT:    v_readlane_b32 s3, v22, 61
+; CHECK-NEXT:    v_readlane_b32 s4, v22, 62
+; CHECK-NEXT:    v_readlane_b32 s5, v22, 63
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 0
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 1
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 3
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 5
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 2
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 3
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 4
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 5
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 6
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 7
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 8
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 9
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 10
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 11
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 12
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 13
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 14
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 15
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 16
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 17
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 18
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 19
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 20
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 21
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 22
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 23
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 24
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 25
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 26
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 24
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 25
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 26
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 27
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 28
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 29
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 30
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 31
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 39
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 40
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 41
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 42
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 43
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 44
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 45
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 46
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 47
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 32
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 33
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 34
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 35
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 36
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 37
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 38
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 39
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 40
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 41
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 42
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 43
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 44
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 45
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 46
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 47
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index b4a981f1db4ec7..882356d994fc68 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,12 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; GCN-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0
-  ; GCN-NEXT:   SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %12.sub0
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
   ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]]
+  ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %12
   ; GCN-NEXT:   S_ENDPGM 0
   %v0 = call i32 asm sideeffect "; def $0", "=v"()
   %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0

>From 0a739e9427fa342fe1333e1fb9c196b4d55afb8f Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Mon, 28 Oct 2024 15:39:17 -0700
Subject: [PATCH 11/13] This commit: (1) use checkForAllInstructions() to check
 AddrSpaceCast instructions instead of walking through all instructions (2)
 move constHasASCast() to AMDGPUInformationCache to make use the existing
 function getConstantAccess().

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 66 ++++++++++-----------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 6f64c1cdb8094d..81890d1094bf1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -262,6 +262,18 @@ class AMDGPUInformationCache : public InformationCache {
     return !HasAperture && (Access & ADDR_SPACE_CAST);
   }
 
+  bool constHasASCastFromPrivate(const Constant *C, Function &Fn) {
+    SmallPtrSet<const Constant *, 8> Visited;
+    uint8_t Access = getConstantAccess(C, Visited);
+
+    if (Access & ADDR_SPACE_CAST)
+      if (const auto *CE = dyn_cast<ConstantExpr>(C))
+        if (CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+            AMDGPUAS::PRIVATE_ADDRESS)
+          return true;
+    return false;
+  }
+
 private:
   /// Used to determine if the Constant needs the queue pointer.
   DenseMap<const Constant *, uint8_t> ConstantStatus;
@@ -440,21 +452,31 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       return;
     }
 
-    SmallPtrSet<const Constant *, 8> VisitedConsts;
+    { // FLAT_SCRATCH_INIT
+      auto AddrSpaceCastNotFromPrivate = [&](Instruction &I) {
+        return static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace() !=
+               AMDGPUAS::PRIVATE_ADDRESS;
+      };
 
-    for (Instruction &I : instructions(F)) {
-      if (isa<AddrSpaceCastInst>(I) &&
-          cast<AddrSpaceCastInst>(I).getSrcAddressSpace() ==
-              AMDGPUAS::PRIVATE_ADDRESS) {
+      bool UsedAssumedInformation = false;
+      if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
+                                     {Instruction::AddrSpaceCast},
+                                     UsedAssumedInformation)) {
+        // If there is AddrSpaceCast instruction that casts from PRIVATE_ADDRESS
         removeAssumedBits(FLAT_SCRATCH_INIT);
         return;
       }
-      // check for addrSpaceCast in constant expressions
-      for (const Use &U : I.operands()) {
-        if (const auto *C = dyn_cast<Constant>(U)) {
-          if (constHasASCast(C, VisitedConsts)) {
-            removeAssumedBits(FLAT_SCRATCH_INIT);
-            return;
+
+      auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+      for (Instruction &I : instructions(F)) {
+        // check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
+        for (const Use &U : I.operands()) {
+          if (const auto *C = dyn_cast<Constant>(U)) {
+            if (InfoCache.constHasASCastFromPrivate(C, *F)) {
+              removeAssumedBits(FLAT_SCRATCH_INIT);
+              return;
+            }
           }
         }
       }
@@ -737,28 +759,6 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                               UsedAssumedInformation);
   }
-
-  bool constHasASCast(const Constant *C,
-                      SmallPtrSetImpl<const Constant *> &Visited) {
-    if (!Visited.insert(C).second)
-      return false;
-
-    if (const auto *CE = dyn_cast<ConstantExpr>(C))
-      if (CE->getOpcode() == Instruction::AddrSpaceCast &&
-          CE->getOperand(0)->getType()->getPointerAddressSpace() ==
-              AMDGPUAS::PRIVATE_ADDRESS)
-        return true;
-
-    for (const Use &U : C->operands()) {
-      const auto *OpC = dyn_cast<Constant>(U);
-      if (!OpC || !Visited.insert(OpC).second)
-        continue;
-
-      if (constHasASCast(OpC, Visited))
-        return true;
-    }
-    return false;
-  }
 };
 
 AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,

>From 944dfc3abbc39ce592c57a415aee326499a7cc16 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Tue, 29 Oct 2024 18:00:41 -0700
Subject: [PATCH 12/13] Move code from initialize() to needFlatScratch() which
 is called in updateImpl().

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 60 ++++++++++-----------
 1 file changed, 29 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 81890d1094bf1e..0f8a3279ebbff1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -451,36 +451,6 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       indicatePessimisticFixpoint();
       return;
     }
-
-    { // FLAT_SCRATCH_INIT
-      auto AddrSpaceCastNotFromPrivate = [&](Instruction &I) {
-        return static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace() !=
-               AMDGPUAS::PRIVATE_ADDRESS;
-      };
-
-      bool UsedAssumedInformation = false;
-      if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
-                                     {Instruction::AddrSpaceCast},
-                                     UsedAssumedInformation)) {
-        // If there is AddrSpaceCast instruction that casts from PRIVATE_ADDRESS
-        removeAssumedBits(FLAT_SCRATCH_INIT);
-        return;
-      }
-
-      auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
-
-      for (Instruction &I : instructions(F)) {
-        // check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
-        for (const Use &U : I.operands()) {
-          if (const auto *C = dyn_cast<Constant>(U)) {
-            if (InfoCache.constHasASCastFromPrivate(C, *F)) {
-              removeAssumedBits(FLAT_SCRATCH_INIT);
-              return;
-            }
-          }
-        }
-      }
-    }
   }
 
   ChangeStatus updateImpl(Attributor &A) override {
@@ -734,6 +704,34 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
   bool needFlatScratchInit(Attributor &A) {
     assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
 
+    // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
+    // there is a cast from PRIVATE_ADDRESS.
+    auto AddrSpaceCastNotFromPrivate = [&](Instruction &I) {
+      return static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace() !=
+             AMDGPUAS::PRIVATE_ADDRESS;
+    };
+
+    bool UsedAssumedInformation = false;
+    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
+                                   {Instruction::AddrSpaceCast},
+                                   UsedAssumedInformation))
+      return true;
+
+    // Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+    Function *F = getAssociatedFunction();
+    for (Instruction &I : instructions(F)) {
+      for (const Use &U : I.operands()) {
+        if (const auto *C = dyn_cast<Constant>(U)) {
+          if (InfoCache.constHasASCastFromPrivate(C, *F))
+            return true;
+        }
+      }
+    }
+
+    // Finally check callees.
+
     // This is called on each callee; false means callee shouldn't have
     // no-flat-scratch-init.
     auto CheckForNoFlatScratchInit = [&](Instruction &I) {
@@ -752,7 +750,7 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
              Intrinsic::amdgcn_addrspacecast_nonnull;
     };
 
-    bool UsedAssumedInformation = false;
+    UsedAssumedInformation = false;
     // If any callee is false (i.e. need FlatScratchInit),
     // checkForAllCallLikeInstructions returns false, in which case this
     // function returns true.

>From 6296be119584be71580d2791a39a9ec1b0a9aa20 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 31 Oct 2024 17:06:38 -0700
Subject: [PATCH 13/13] Some minor code changes.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 0f8a3279ebbff1..d8aa721b7ab250 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -262,7 +262,7 @@ class AMDGPUInformationCache : public InformationCache {
     return !HasAperture && (Access & ADDR_SPACE_CAST);
   }
 
-  bool constHasASCastFromPrivate(const Constant *C, Function &Fn) {
+  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
     SmallPtrSet<const Constant *, 8> Visited;
     uint8_t Access = getConstantAccess(C, Visited);
 
@@ -706,8 +706,8 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
 
     // Check all AddrSpaceCast instructions. FlatScratchInit is needed if
     // there is a cast from PRIVATE_ADDRESS.
-    auto AddrSpaceCastNotFromPrivate = [&](Instruction &I) {
-      return static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace() !=
+    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
+      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
              AMDGPUAS::PRIVATE_ADDRESS;
     };
 
@@ -724,7 +724,7 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     for (Instruction &I : instructions(F)) {
       for (const Use &U : I.operands()) {
         if (const auto *C = dyn_cast<Constant>(U)) {
-          if (InfoCache.constHasASCastFromPrivate(C, *F))
+          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
             return true;
         }
       }



More information about the llvm-commits mailing list