[clang] [llvm] [AMDGPU] Don't allow wgp mode on gfx1250 (PR #153680)

Stanislav Mekhanoshin via cfe-commits cfe-commits at lists.llvm.org
Thu Aug 14 13:56:54 PDT 2025


https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/153680

- gfx1250 only supports cu mode

>From 55224ad674f77ee1d9ffe365e9c8bfd579c7bd59 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 14 Aug 2025 13:36:39 -0700
Subject: [PATCH] [AMDGPU] Don't allow wgp mode on gfx1250

- gfx1250 only supports cu mode
---
 clang/test/CodeGenHIP/hip-cumode.hip          |   10 +-
 clang/test/Driver/hip-macros.hip              |   14 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |    5 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |    5 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |    6 +-
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     |    3 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |   21 +-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |    1 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |  101 +-
 llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll  |  364 ++---
 .../memory-legalizer-local-workgroup.ll       | 1211 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll       |  288 ++--
 llvm/test/MC/AMDGPU/hsa-diag-v4.s             |   33 +-
 13 files changed, 1623 insertions(+), 439 deletions(-)

diff --git a/clang/test/CodeGenHIP/hip-cumode.hip b/clang/test/CodeGenHIP/hip-cumode.hip
index 1aa1ca7a1a7ee..61fd53c644e8c 100644
--- a/clang/test/CodeGenHIP/hip-cumode.hip
+++ b/clang/test/CodeGenHIP/hip-cumode.hip
@@ -5,14 +5,20 @@
 // RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=NOWGP %s
 // RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
-// RUN:   %s 2>&1 | FileCheck --check-prefixes=NOWGP,WARN-CUMODE %s
+// RUN:   %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=NOWGP,WARN-CUMODE %s
 // RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
 // RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
 // RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
-// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
+// RUN:   %s 2>&1 | FileCheck --check-prefix=NOWGP %s
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
+// RUN:   %s 2>&1 | FileCheck --check-prefix=NOWGP %s
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
+// RUN:   %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=NOWGP,WARN-CUMODE %s
+// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
 // NOWGP-NOT: .amdhsa_workgroup_processor_mode
 // CUMODE-ON: .amdhsa_workgroup_processor_mode 0
 // CUMODE-OFF: .amdhsa_workgroup_processor_mode 1
diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip
index bd93f9985a774..516e01a6c4743 100644
--- a/clang/test/Driver/hip-macros.hip
+++ b/clang/test/Driver/hip-macros.hip
@@ -27,21 +27,27 @@
 // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
 // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
-// RUN:   %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN:   %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
 // RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
 // RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
 // RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
+// RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
+// RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
+// RUN:   %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
 
 // Check no duplicate warnings.
 // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
 // RUN:   -mno-cumode -mno-cumode \
-// RUN:   %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN:   %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
 
-// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
-// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
+// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
 // CUMODE-ON-DAG: #define __AMDGCN_CUMODE__ 1
 // CUMODE-OFF-DAG: #define __AMDGCN_CUMODE__ 0
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c7d2d268a2707..188c126cb9fbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1144,8 +1144,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                               CreateExpr(STM.getWavefrontSize()), Ctx),
       CreateExpr(1ULL << ScratchAlignShift));
 
-  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
+  if (STM.supportsWGP()) {
     ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
+  }
+
+  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
     ProgInfo.MemOrdered = 1;
     ProgInfo.FwdProgress = 1;
   }
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0184075c2c909..951473264d089 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6270,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                        ExprVal, ValRange);
     } else if (ID == ".amdhsa_workgroup_processor_mode") {
-      if (IVersion.Major < 10)
-        return Error(IDRange.Start, "directive requires gfx10+", IDRange);
+      if (!supportsWGP(getSTI()))
+        return Error(IDRange.Start,
+                     "directive unsupported on " + getSTI().getCPU(), IDRange);
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
                        COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal,
                        ValRange);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f47ddf5d93ec3..7ca7e8448c63d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -390,7 +390,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// the original value.
   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
 
-  bool supportsWGP() const { return getGeneration() >= GFX10; }
+  bool supportsWGP() const {
+    if (GFX1250Insts)
+      return false;
+    return getGeneration() >= GFX10;
+  }
 
   bool hasIntClamp() const {
     return HasIntClamp;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 68302f0dd0d64..1f35e92151bfc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
     PrintField(KD.compute_pgm_rsrc3,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split");
-  if (IVersion.Major >= 10) {
+  if (AMDGPU::supportsWGP(STI))
     PrintField(KD.compute_pgm_rsrc1,
                amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
                amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
                ".amdhsa_workgroup_processor_mode");
+  if (IVersion.Major >= 10) {
     PrintField(KD.compute_pgm_rsrc1,
                amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
                amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ec9f1abdd8467..c41d62748c4be 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1167,12 +1167,21 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
 
 unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
   // "Per CU" really means "per whatever functional block the waves of a
-  // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+  // workgroup must share".
+
+  // GFX12.5 only supports CU mode, and its CU contains four SIMDs.
+  if (isGFX1250(*STI)) {
+    assert(STI->getFeatureBits().test(FeatureCuMode));
+    return 4;
+  }
+
+  // For gfx10 in CU mode the functional block is the CU, which contains
   // two SIMDs.
   if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode))
     return 2;
-  // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
-  // two CUs, so a total of four SIMDs.
+
+  // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP
+  // contains two CUs, so a total of four SIMDs.
   return 4;
 }
 
@@ -2480,6 +2489,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
 }
 
+bool supportsWGP(const MCSubtargetInfo &STI) {
+  if (isGFX1250(STI))
+    return false;
+  return isGFX10Plus(STI);
+}
+
 bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); }
 
 bool isNotGFX10Plus(const MCSubtargetInfo &STI) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 704bf106ace76..befab68bb5698 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1549,6 +1549,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
 bool isGFX12(const MCSubtargetInfo &STI);
 bool isGFX12Plus(const MCSubtargetInfo &STI);
 bool isGFX1250(const MCSubtargetInfo &STI);
+bool supportsWGP(const MCSubtargetInfo &STI);
 bool isNotGFX12Plus(const MCSubtargetInfo &STI);
 bool isNotGFX11Plus(const MCSubtargetInfo &STI);
 bool isGCN3Encoding(const MCSubtargetInfo &STI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 3daae98961bff..01854c8560ce2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2854,89 +2854,90 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[16:17], v0, v14, 0
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[18:19], v0, v12, 0
+; GFX1250-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
 ; GFX1250-NEXT:    v_mul_lo_u32 v27, v5, v10
 ; GFX1250-NEXT:    v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[0:1], v16, v14, 0
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[18:19], v16, v12, 0
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[20:21], v0, v10, 0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[20:21], v16, v10, 0
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
 ; GFX1250-NEXT:    v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
 ; GFX1250-NEXT:    v_mul_lo_u32 v22, v6, v9
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, s0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
 ; GFX1250-NEXT:    v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s2
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
-; GFX1250-NEXT:    v_mad_nc_u64_u32 v[16:17], v0, v8, 0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
+; GFX1250-NEXT:    v_mad_nc_u64_u32 v[0:1], v16, v8, 0
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
 ; GFX1250-NEXT:    v_mul_lo_u32 v20, v2, v13
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT:    v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT:    v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v21, s2
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
-; GFX1250-NEXT:    v_mul_lo_u32 v0, v0, v15
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v21, s2
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
+; GFX1250-NEXT:    v_mul_lo_u32 v2, v16, v15
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX1250-NEXT:    v_mul_lo_u32 v9, v1, v14
+; GFX1250-NEXT:    v_mul_lo_u32 v9, v17, v14
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, s2
-; GFX1250-NEXT:    v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, s2
+; GFX1250-NEXT:    v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v23, v0, s2
-; GFX1250-NEXT:    v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v9, s5
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, v23, v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v15
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v9, s5
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v20, s4
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v20, s4
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v29, s3
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v25, s1
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v27, s0
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v25, s1
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v27, s0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
-; GFX1250-NEXT:    v_mad_u32 v7, v7, v8, v0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, v16
+; GFX1250-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
+; GFX1250-NEXT:    v_mad_u32 v7, v7, v8, v1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v14
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %result = mul i256 %num, %den
   ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index f0db321d3931a..e532deaca98a8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
 
-; Test S_WAIT_XCNT insertion for global_load/store instructions.
+; Test S_WAIT_XCNT insertion for global_load/store clauses.
 ; Introduced additional operations in between the clauses to have the register dependency
 ; between the operands of VMEM operations and the def ops of VALU instructions that followed.
 
@@ -123,29 +123,10 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
 ; GCN-SDAG:       ; %bb.0:
 ; GCN-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT:    s_clause 0xd
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v63, s32 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:224
-; GCN-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
-; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT:    scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v39, v4 :: v_dual_mov_b32 v38, v3
+; GCN-SDAG-NEXT:    s_clause 0xf
+; GCN-SDAG-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:224
 ; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:240
-; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT:    scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-SDAG-NEXT:    s_clause 0xd
 ; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:192
 ; GCN-SDAG-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:208
 ; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:160
@@ -155,138 +136,103 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
 ; GCN-SDAG-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:96
 ; GCN-SDAG-NEXT:    global_load_b128 v[48:51], v[0:1], off offset:112
 ; GCN-SDAG-NEXT:    global_load_b128 v[52:55], v[0:1], off offset:64
-; GCN-SDAG-NEXT:    global_load_b128 v[38:41], v[0:1], off offset:80
-; GCN-SDAG-NEXT:    global_load_b128 v[42:45], v[0:1], off offset:32
-; GCN-SDAG-NEXT:    global_load_b128 v[56:59], v[0:1], off offset:48
-; GCN-SDAG-NEXT:    global_load_b128 v[60:63], v[0:1], off
-; GCN-SDAG-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
-; GCN-SDAG-NEXT:    scratch_load_b128 v[6:9], off, s32 offset:56 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:224
-; GCN-SDAG-NEXT:    scratch_load_b128 v[6:9], off, s32 offset:72 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT:    s_clause 0xe
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:240
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:192
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[14:17], off offset:208
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:160
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[22:25], off offset:176
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[26:29], off offset:128
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[30:33], off offset:144
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[34:37], off offset:96
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[48:51], off offset:112
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[52:55], off offset:64
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[38:41], off offset:80
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[42:45], off offset:32
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[56:59], off offset:48
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[60:63], off
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
-; GCN-SDAG-NEXT:    s_clause 0xd
-; GCN-SDAG-NEXT:    scratch_load_b32 v63, off, s32
-; GCN-SDAG-NEXT:    scratch_load_b32 v62, off, s32 offset:4
-; GCN-SDAG-NEXT:    scratch_load_b32 v61, off, s32 offset:8
-; GCN-SDAG-NEXT:    scratch_load_b32 v60, off, s32 offset:12
-; GCN-SDAG-NEXT:    scratch_load_b32 v59, off, s32 offset:16
-; GCN-SDAG-NEXT:    scratch_load_b32 v58, off, s32 offset:20
-; GCN-SDAG-NEXT:    scratch_load_b32 v57, off, s32 offset:24
-; GCN-SDAG-NEXT:    scratch_load_b32 v56, off, s32 offset:28
-; GCN-SDAG-NEXT:    scratch_load_b32 v45, off, s32 offset:32
-; GCN-SDAG-NEXT:    scratch_load_b32 v44, off, s32 offset:36
-; GCN-SDAG-NEXT:    scratch_load_b32 v43, off, s32 offset:40
-; GCN-SDAG-NEXT:    scratch_load_b32 v42, off, s32 offset:44
-; GCN-SDAG-NEXT:    scratch_load_b32 v41, off, s32 offset:48
-; GCN-SDAG-NEXT:    scratch_load_b32 v40, off, s32 offset:52
-; GCN-SDAG-NEXT:    s_wait_xcnt 0xe
-; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-SDAG-NEXT:    global_load_b128 v[64:67], v[0:1], off offset:80
+; GCN-SDAG-NEXT:    global_load_b128 v[68:71], v[0:1], off offset:32
+; GCN-SDAG-NEXT:    global_load_b128 v[80:83], v[0:1], off offset:48
+; GCN-SDAG-NEXT:    global_load_b128 v[84:87], v[0:1], off
+; GCN-SDAG-NEXT:    global_load_b128 v[96:99], v[0:1], off offset:16
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0xf
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[2:5], off offset:224
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0xe
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[6:9], off offset:240
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0xd
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[10:13], off offset:192
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0xc
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[14:17], off offset:208
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0xb
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[18:21], off offset:160
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0xa
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[22:25], off offset:176
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x9
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[26:29], off offset:128
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x8
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[30:33], off offset:144
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x7
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[34:37], off offset:96
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x6
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[48:51], off offset:112
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x5
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[52:55], off offset:64
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x4
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[64:67], off offset:80
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x3
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[68:71], off offset:32
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x2
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[80:83], off offset:48
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x1
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[84:87], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[96:99], off offset:16
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x10
+; GCN-SDAG-NEXT:    v_mov_b32_e32 v0, v98
 ; GCN-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: test_v64i32_load_store:
 ; GCN-GISEL:       ; %bb.0:
 ; GCN-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT:    v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v4
 ; GCN-GISEL-NEXT:    s_clause 0xf
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v63, s32 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    s_wait_xcnt 0x8
-; GCN-GISEL-NEXT:    v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
 ; GCN-GISEL-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:32
-; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT:    scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-GISEL-NEXT:    s_clause 0xe
 ; GCN-GISEL-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:48
-; GCN-GISEL-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:64
-; GCN-GISEL-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:80
-; GCN-GISEL-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-GISEL-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:112
-; GCN-GISEL-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:128
-; GCN-GISEL-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:144
-; GCN-GISEL-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:160
-; GCN-GISEL-NEXT:    global_load_b128 v[48:51], v[0:1], off offset:176
-; GCN-GISEL-NEXT:    global_load_b128 v[52:55], v[0:1], off offset:192
-; GCN-GISEL-NEXT:    global_load_b128 v[38:41], v[0:1], off offset:208
-; GCN-GISEL-NEXT:    global_load_b128 v[42:45], v[0:1], off offset:224
-; GCN-GISEL-NEXT:    global_load_b128 v[56:59], v[0:1], off
-; GCN-GISEL-NEXT:    global_load_b128 v[60:63], v[0:1], off offset:16
-; GCN-GISEL-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:240
-; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT:    s_clause 0x1
-; GCN-GISEL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU
-; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT:    s_clause 0xe
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[0:3], off offset:32
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[6:9], off offset:48
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[10:13], off offset:64
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[14:17], off offset:80
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[18:21], off offset:96
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[22:25], off offset:112
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[26:29], off offset:128
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[30:33], off offset:144
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[34:37], off offset:160
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[48:51], off offset:176
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[52:55], off offset:192
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[38:41], off offset:208
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[42:45], off offset:224
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[56:59], off
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[60:63], off offset:16
-; GCN-GISEL-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT:    global_store_b128 v[46:47], v[0:3], off offset:240
-; GCN-GISEL-NEXT:    s_wait_xcnt 0x0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, v62
-; GCN-GISEL-NEXT:    s_clause 0xf
-; GCN-GISEL-NEXT:    scratch_load_b32 v63, off, s32
-; GCN-GISEL-NEXT:    scratch_load_b32 v62, off, s32 offset:4
-; GCN-GISEL-NEXT:    scratch_load_b32 v61, off, s32 offset:8
-; GCN-GISEL-NEXT:    scratch_load_b32 v60, off, s32 offset:12
-; GCN-GISEL-NEXT:    scratch_load_b32 v59, off, s32 offset:16
-; GCN-GISEL-NEXT:    scratch_load_b32 v58, off, s32 offset:20
-; GCN-GISEL-NEXT:    scratch_load_b32 v57, off, s32 offset:24
-; GCN-GISEL-NEXT:    scratch_load_b32 v56, off, s32 offset:28
-; GCN-GISEL-NEXT:    scratch_load_b32 v47, off, s32 offset:32
-; GCN-GISEL-NEXT:    scratch_load_b32 v46, off, s32 offset:36
-; GCN-GISEL-NEXT:    scratch_load_b32 v45, off, s32 offset:40
-; GCN-GISEL-NEXT:    scratch_load_b32 v44, off, s32 offset:44
-; GCN-GISEL-NEXT:    scratch_load_b32 v43, off, s32 offset:48
-; GCN-GISEL-NEXT:    scratch_load_b32 v42, off, s32 offset:52
-; GCN-GISEL-NEXT:    scratch_load_b32 v41, off, s32 offset:56
-; GCN-GISEL-NEXT:    scratch_load_b32 v40, off, s32 offset:60
+; GCN-GISEL-NEXT:    global_load_b128 v[10:13], v[0:1], off
+; GCN-GISEL-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:16
+; GCN-GISEL-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:64
+; GCN-GISEL-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:80
+; GCN-GISEL-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:96
+; GCN-GISEL-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:112
+; GCN-GISEL-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:128
+; GCN-GISEL-NEXT:    global_load_b128 v[48:51], v[0:1], off offset:144
+; GCN-GISEL-NEXT:    global_load_b128 v[52:55], v[0:1], off offset:160
+; GCN-GISEL-NEXT:    global_load_b128 v[64:67], v[0:1], off offset:176
+; GCN-GISEL-NEXT:    global_load_b128 v[68:71], v[0:1], off offset:192
+; GCN-GISEL-NEXT:    global_load_b128 v[80:83], v[0:1], off offset:208
+; GCN-GISEL-NEXT:    global_load_b128 v[84:87], v[0:1], off offset:224
+; GCN-GISEL-NEXT:    global_load_b128 v[96:99], v[0:1], off offset:240
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0xf
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[2:5], off offset:32
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0xe
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[6:9], off offset:48
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0xd
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[10:13], off
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0xc
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[14:17], off offset:16
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0xb
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[18:21], off offset:64
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0xa
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[22:25], off offset:80
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x9
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[26:29], off offset:96
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x8
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[30:33], off offset:112
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x7
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[34:37], off offset:128
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x6
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[48:51], off offset:144
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x5
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[52:55], off offset:160
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x4
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[64:67], off offset:176
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x3
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[68:71], off offset:192
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x2
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[80:83], off offset:208
+; GCN-GISEL-NEXT:    s_wait_loadcnt 0x1
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[84:87], off offset:224
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[96:99], off offset:240
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x10
+; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, v16
 ; GCN-GISEL-NEXT:    s_set_pc_i64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   store <64 x i32> %vec, ptr addrspace(1) %out, align 4
@@ -299,99 +245,78 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-SDAG:       ; %bb.0:
 ; GCN-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT:    s_clause 0x3
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
-; GCN-SDAG-NEXT:    scratch_store_b32 off, v43, s32 scope:SCOPE_SE
 ; GCN-SDAG-NEXT:    s_clause 0x7
-; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:112
-; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:80
-; GCN-SDAG-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:48
-; GCN-SDAG-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:32
+; GCN-SDAG-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:112
+; GCN-SDAG-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:96
+; GCN-SDAG-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:80
+; GCN-SDAG-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:48
+; GCN-SDAG-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:32
 ; GCN-SDAG-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:16
 ; GCN-SDAG-NEXT:    global_load_b128 v[30:33], v[0:1], off
 ; GCN-SDAG-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:64
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[24:25], 0x70
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[50:51], 0x60
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[52:53], 48
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[54:55], 32
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[38:39], 0x50
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
-; GCN-SDAG-NEXT:    v_mov_b64_e32 v[48:49], 64
-; GCN-SDAG-NEXT:    v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[36:37], 0x70
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[48:49], 48
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[38:39], 0x60
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[50:51], 32
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[64:65], 16
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[66:67], 0
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[52:53], 0x50
+; GCN-SDAG-NEXT:    v_mov_b64_e32 v[54:55], 64
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v34, 0xc8 :: v_dual_mov_b32 v35, 0
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT:    global_store_b128 v[24:25], v[10:13], off
+; GCN-SDAG-NEXT:    global_store_b128 v[36:37], v[6:9], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GCN-SDAG-NEXT:    global_store_b128 v[50:51], v[18:21], off
+; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[10:13], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x5
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x1
-; GCN-SDAG-NEXT:    v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
+; GCN-SDAG-NEXT:    v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v37, v17
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x4
-; GCN-SDAG-NEXT:    global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT:    global_store_b128 v[48:49], v[18:21], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x3
-; GCN-SDAG-NEXT:    global_store_b128 v[54:55], v[14:17], off
+; GCN-SDAG-NEXT:    global_store_b128 v[50:51], v[22:25], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x2
-; GCN-SDAG-NEXT:    global_store_b128 v[40:41], v[26:29], off
+; GCN-SDAG-NEXT:    global_store_b128 v[64:65], v[26:29], off
 ; GCN-SDAG-NEXT:    s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT:    global_store_b128 v[42:43], v[30:33], off
-; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT:    s_wait_xcnt 0x3
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1]
+; GCN-SDAG-NEXT:    global_store_b128 v[66:67], v[30:33], off
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
-; GCN-SDAG-NEXT:    s_wait_xcnt 0x2
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[16:17], 0x64, v[16:17]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
-; GCN-SDAG-NEXT:    s_wait_xcnt 0x1
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7]
+; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[14:15], 0xc8, v[14:15]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[24:25], 0x64, v[24:25]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23]
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
-; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
 ; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
-; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
+; GCN-SDAG-NEXT:    v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
 ; GCN-SDAG-NEXT:    s_clause 0x1
-; GCN-SDAG-NEXT:    global_store_b128 v[38:39], v[22:25], off
-; GCN-SDAG-NEXT:    global_store_b128 v[48:49], v[0:3], off
+; GCN-SDAG-NEXT:    global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT:    global_store_b128 v[54:55], v[0:3], off
 ; GCN-SDAG-NEXT:    s_clause 0x7
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:96
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:112
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[50:53], off offset:64
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:80
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[14:17], off offset:32
-; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[34:37], off offset:48
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:96
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:112
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[48:51], off offset:64
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[14:17], off offset:80
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[22:25], off offset:32
+; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:48
 ; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[30:33], off
 ; GCN-SDAG-NEXT:    global_store_b128 v[4:5], v[26:29], off offset:16
-; GCN-SDAG-NEXT:    s_clause 0x3
-; GCN-SDAG-NEXT:    scratch_load_b32 v43, off, s32
-; GCN-SDAG-NEXT:    scratch_load_b32 v42, off, s32 offset:4
-; GCN-SDAG-NEXT:    scratch_load_b32 v41, off, s32 offset:8
-; GCN-SDAG-NEXT:    scratch_load_b32 v40, off, s32 offset:12
-; GCN-SDAG-NEXT:    s_wait_xcnt 0xc
+; GCN-SDAG-NEXT:    s_wait_xcnt 0x8
 ; GCN-SDAG-NEXT:    v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
-; GCN-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GCN-SDAG-NEXT:    s_set_pc_i64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: test_v16i64_load_store:
 ; GCN-GISEL:       ; %bb.0:
 ; GCN-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GCN-GISEL-NEXT:    s_clause 0x5
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
-; GCN-GISEL-NEXT:    scratch_store_b32 off, v45, s32 scope:SCOPE_SE
 ; GCN-GISEL-NEXT:    s_clause 0x7
 ; GCN-GISEL-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:80
 ; GCN-GISEL-NEXT:    global_load_b128 v[10:13], v[0:1], off
@@ -405,11 +330,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[48:49], 16
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[50:51], 32
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[52:53], 48
-; GCN-GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x60
+; GCN-GISEL-NEXT:    v_mov_b64_e32 v[66:67], 0x60
+; GCN-GISEL-NEXT:    v_mov_b64_e32 v[68:69], 0x70
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[54:55], 64
-; GCN-GISEL-NEXT:    v_mov_b64_e32 v[44:45], 0x70
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[34:35], 0xc8
-; GCN-GISEL-NEXT:    v_mov_b64_e32 v[40:41], 0x50
+; GCN-GISEL-NEXT:    v_mov_b64_e32 v[64:65], 0x50
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x6
 ; GCN-GISEL-NEXT:    global_store_b128 v[38:39], v[10:13], off
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x5
@@ -419,13 +344,13 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x3
 ; GCN-GISEL-NEXT:    global_store_b128 v[52:53], v[22:25], off
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x2
-; GCN-GISEL-NEXT:    global_store_b128 v[42:43], v[26:29], off
+; GCN-GISEL-NEXT:    global_store_b128 v[66:67], v[26:29], off
 ; GCN-GISEL-NEXT:    s_wait_loadcnt 0x1
-; GCN-GISEL-NEXT:    global_store_b128 v[44:45], v[30:33], off
-; GCN-GISEL-NEXT:    v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT:    global_store_b128 v[68:69], v[30:33], off
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x5
-; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
 ; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
+; GCN-GISEL-NEXT:    v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x4
 ; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
 ; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
@@ -448,7 +373,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-GISEL-NEXT:    v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
 ; GCN-GISEL-NEXT:    s_clause 0x1
 ; GCN-GISEL-NEXT:    global_store_b128 v[54:55], v[0:3], off
-; GCN-GISEL-NEXT:    global_store_b128 v[40:41], v[34:37], off
+; GCN-GISEL-NEXT:    global_store_b128 v[64:65], v[34:37], off
 ; GCN-GISEL-NEXT:    s_clause 0x7
 ; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[10:13], off
 ; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[14:17], off offset:16
@@ -458,15 +383,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 ; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:80
 ; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[26:29], off offset:96
 ; GCN-GISEL-NEXT:    global_store_b128 v[4:5], v[30:33], off offset:112
-; GCN-GISEL-NEXT:    s_clause 0x5
-; GCN-GISEL-NEXT:    scratch_load_b32 v45, off, s32
-; GCN-GISEL-NEXT:    scratch_load_b32 v44, off, s32 offset:4
-; GCN-GISEL-NEXT:    scratch_load_b32 v43, off, s32 offset:8
-; GCN-GISEL-NEXT:    scratch_load_b32 v42, off, s32 offset:12
-; GCN-GISEL-NEXT:    scratch_load_b32 v41, off, s32 offset:16
-; GCN-GISEL-NEXT:    scratch_load_b32 v40, off, s32 offset:20
+; GCN-GISEL-NEXT:    s_wait_xcnt 0x9
 ; GCN-GISEL-NEXT:    v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13
-; GCN-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GCN-GISEL-NEXT:    s_set_pc_i64 s[30:31]
   %a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4
   %in_a = insertelement <16 x i64> %a, i64 100, i32 5
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 62d7f4801baf8..94f5aab1eb67d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -12,6 +12,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s
 
 define amdgpu_kernel void @local_workgroup_unordered_load(
 ; GFX6-LABEL: local_workgroup_unordered_load:
@@ -177,6 +178,18 @@ define amdgpu_kernel void @local_workgroup_unordered_load(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_unordered_load:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    ds_load_b32 v1, v0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
   %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") unordered, align 4
@@ -348,6 +361,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_load(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_load:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    ds_load_b32 v1, v0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
   %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") monotonic, align 4
@@ -524,6 +549,18 @@ define amdgpu_kernel void @local_workgroup_acquire_load(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_load:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    ds_load_b32 v1, v0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
   %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4
@@ -718,6 +755,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_load:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_load_b32 v1, v0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
   %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4
@@ -859,6 +909,16 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_unordered_store:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
 entry:
   store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") unordered, align 4
@@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_store:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
 entry:
   store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") monotonic, align 4
@@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_workgroup_release_store(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_store:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
 entry:
   store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
@@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_store:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
 entry:
   store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4
@@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") monotonic
@@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire
@@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release
@@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire
@@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_load:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    ds_load_b32 v1, v0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
   %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") unordered, align 4
@@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    ds_load_b32 v1, v0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
   %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") monotonic, align 4
@@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_load:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    ds_load_b32 v1, v0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
   %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") acquire, align 4
@@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    ds_load_b32 v1, v0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
   %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_store:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
 entry:
   store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") unordered, align 4
@@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
 entry:
   store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_store:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
 entry:
   store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") release, align 4
@@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
 entry:
   store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") release
@@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU:       ; %bb.0: ; %entry
+; GFX1250-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT:    s_wait_dscnt 0x0
+; GFX1250-CU-NEXT:    ds_store_b32 v0, v1
+; GFX1250-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 8304be958f1ad..f78168ba29ef1 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -343,66 +343,66 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v40, 7, v0
+; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v56, 7, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v40, s[34:35] offset:16
-; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v40, s[34:35] offset:48
-; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v40, s[34:35] offset:32
-; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v40, s[34:35]
-; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v40, s[34:35] offset:80
-; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v40, s[34:35] offset:96
-; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v40, s[34:35] offset:64
-; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v40, s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v56, s[34:35] offset:112
 ; GFX1250-SDAG-NEXT:    s_load_b512 s[16:31], s[4:5], 0xa4
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_load_b512 s[0:15], s[4:5], 0xe4
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], v[34:35]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], v[38:39]
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], v[42:43]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], v[36:37]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[36:37], s[16:17]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], v[40:41]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], v[38:39]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[16:17]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], v[34:35]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], v[38:39]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], v[32:33]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], v[34:35]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[54:55]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], v[56:57]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[52:53]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], v[52:53]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], v[54:55]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], v[50:51]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], v[40:41]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], v[46:47]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], v[48:49]
 ; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[42:43]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], v[48:49]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], v[50:51]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], v[44:45]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], v[46:47]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], v[32:33]
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[36:37]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[44:45]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], v[36:37]
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[38:39]
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[0:3], s[34:35] offset:16
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fadd_v32_vs:
@@ -1600,66 +1600,66 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v40, 7, v0
+; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v56, 7, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v40, s[34:35] offset:16
-; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v40, s[34:35] offset:48
-; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v40, s[34:35] offset:32
-; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v40, s[34:35]
-; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v40, s[34:35] offset:80
-; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v40, s[34:35] offset:96
-; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v40, s[34:35] offset:64
-; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v40, s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v56, s[34:35] offset:112
 ; GFX1250-SDAG-NEXT:    s_load_b512 s[16:31], s[4:5], 0xa4
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_load_b512 s[0:15], s[4:5], 0xe4
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], v[34:35]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9
-; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], v[42:43]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], v[36:37]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[36:37], s[16:17]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], v[40:41]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], v[38:39]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[16:17]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], v[34:35]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], v[38:39]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], v[32:33]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], v[34:35]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], v[54:55]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], v[56:57]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], v[52:53]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], v[52:53]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], v[54:55]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], v[50:51]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], v[40:41]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], v[46:47]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], v[48:49]
 ; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], v[42:43]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], v[48:49]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], v[50:51]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], v[44:45]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], v[46:47]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], v[32:33]
-; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[36:37]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], v[44:45]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], v[36:37]
+; GFX1250-SDAG-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], v[38:39]
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT:    global_store_b128 v40, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[0:3], s[34:35] offset:16
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fmul_v32_vs:
@@ -2431,65 +2431,65 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v34, 7, v0
+; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v56, 7, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v34, s[34:35] offset:16
-; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v34, s[34:35] offset:48
-; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v34, s[34:35] offset:32
-; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v34, s[34:35]
-; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v34, s[34:35] offset:80
-; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v34, s[34:35] offset:96
-; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v34, s[34:35] offset:64
-; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v34, s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT:    global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_load_b128 v[28:31], v56, s[34:35] offset:112
 ; GFX1250-SDAG-NEXT:    s_load_b512 s[16:31], s[4:5], 0xa4
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_load_b512 s[0:15], s[4:5], 0xe4
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[42:43], s[30:31]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[40:41], s[28:29]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[54:55], s[12:13]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[56:57], s[14:15]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[52:53], s[2:3]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[48:49], s[4:5]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[50:51], s[6:7]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[44:45], s[24:25]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[46:47], s[26:27]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[32:33], s[18:19]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[32:33], s[20:21]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[34:35], s[22:23]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[40:41], s[30:31]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[28:29]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[54:55], s[14:15]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[46:47], s[4:5]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[48:49], s[6:7]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[42:43], s[24:25]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[44:45], s[26:27]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[36:37], s[18:19]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[36:37], s[8:9]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[10:11]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[26:27], v[42:43], v[42:43]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
-; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39]
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[16:17]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[12:13], v[54:55], v[54:55]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[16:17], v[36:37], v[36:37]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[18:19], v[38:39], v[38:39]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[14:15], v[56:57], v[56:57]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], v[52:53], v[52:53]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], v[34:35], v[34:35]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[30:31], v[30:31], v[54:55], v[54:55]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[26:27], v[26:27], v[50:51], v[50:51]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[16:17], v[16:17], v[46:47], v[46:47]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[18:19], v[18:19], v[48:49], v[48:49]
 ; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[8:9], v[8:9], v[42:43], v[42:43]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[4:5], v[4:5], v[48:49], v[48:49]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[6:7], v[6:7], v[50:51], v[50:51]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[20:21], v[20:21], v[44:45], v[44:45]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[22:23], v[22:23], v[46:47], v[46:47]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], v[32:33], v[32:33]
-; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[10:11], v[10:11], v[44:45], v[44:45]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37]
+; GFX1250-SDAG-NEXT:    v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39]
 ; GFX1250-SDAG-NEXT:    s_clause 0x7
-; GFX1250-SDAG-NEXT:    global_store_b128 v34, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT:    global_store_b128 v34, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT:    global_store_b128 v34, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT:    global_store_b128 v34, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT:    global_store_b128 v34, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT:    global_store_b128 v34, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT:    global_store_b128 v34, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT:    global_store_b128 v34, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT:    global_store_b128 v56, v[0:3], s[34:35] offset:16
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fma_v32_vs:
diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
index 9ab177cf2b125..44fe55ef6e9ba 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
@@ -1,9 +1,10 @@
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX8,PREGFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx810 --check-prefixes=ALL,GCN,GFX8,PREGFX10,NOWGP,AMDHSA
 // RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX10,AMDHSA
 // RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX11,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx1200 --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,AMDHSA
 // RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,NONAMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GFX90A,PREGFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx90a --check-prefixes=ALL,GFX90A,PREGFX10,NOWGP,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1250 -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx1250 --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,NOWGP,AMDHSA
 
 .text
 
@@ -11,7 +12,7 @@
 // GFX8-NOT: error:
 // GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1010:xnack+
 // GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1100
-// GFX12: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1200
+// GFX12: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--[[MCPU]]
 // NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-unknown--gfx810
 .warning "test_target"
 .amdgcn_target "amdgcn-amd-amdhsa--gfx810:xnack+"
@@ -176,8 +177,7 @@
 .end_amdhsa_kernel
 
 // GCN-LABEL: warning: test_amdhsa_workgroup_processor_mode
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: .amdhsa_next_free_vgpr directive is required
+// NOWGP: error: directive unsupported on [[MCPU]]
 // NONAMDHSA: error: unknown directive
 .warning "test_amdhsa_workgroup_processor_mode"
 .amdhsa_kernel test_amdhsa_workgroup_processor_mode
@@ -185,8 +185,7 @@
 .end_amdhsa_kernel
 
 // GCN-LABEL: warning: test_amdhsa_workgroup_processor_mode_invalid
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: value out of range
+// NOWGP: error: directive unsupported on [[MCPU]]
 // NONAMDHSA: error: unknown directive
 .warning "test_amdhsa_workgroup_processor_mode_invalid"
 .amdhsa_kernel test_amdhsa_workgroup_processor_mode_invalid
@@ -290,6 +289,24 @@
   .amdhsa_inst_pref_size 15
 .end_amdhsa_kernel
 
+// GCN-LABEL: warning: test_amdhsa_dx10_clamp_bit
+// GFX12: error: directive unsupported on gfx12+
+.warning "test_amdhsa_dx10_clamp_bit"
+.amdhsa_kernel test_amdhsa_dx10_clamp_bit
+  .amdhsa_next_free_vgpr 32
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_dx10_clamp 1
+.end_amdhsa_kernel
+
+// GCN-LABEL: warning: test_amdhsa_ieee_mode_bit
+// GFX12: error: directive unsupported on gfx12+
+.warning "test_amdhsa_ieee_mode_bit"
+.amdhsa_kernel test_amdhsa_ieee_mode_bit
+  .amdhsa_next_free_vgpr 32
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_ieee_mode 1
+.end_amdhsa_kernel
+
 // GCN-LABEL: warning: test_next_free_vgpr_invalid
 // AMDHSA: error: .amdgcn.next_free_{v,s}gpr symbols must be absolute expressions
 // NONAMDHSA-NOT: error:



More information about the cfe-commits mailing list