[clang] [llvm] [AMDGPU] Don't allow wgp mode on gfx1250 (PR #153680)
Stanislav Mekhanoshin via cfe-commits
cfe-commits at lists.llvm.org
Thu Aug 14 13:56:54 PDT 2025
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/153680
- gfx1250 only supports cu mode
From 55224ad674f77ee1d9ffe365e9c8bfd579c7bd59 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 14 Aug 2025 13:36:39 -0700
Subject: [PATCH] [AMDGPU] Don't allow wgp mode on gfx1250
- gfx1250 only supports cu mode
---
clang/test/CodeGenHIP/hip-cumode.hip | 10 +-
clang/test/Driver/hip-macros.hip | 14 +-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 5 +-
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 5 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6 +-
.../MCTargetDesc/AMDGPUTargetStreamer.cpp | 3 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 21 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 +
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 101 +-
llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 364 ++---
.../memory-legalizer-local-workgroup.ll | 1211 +++++++++++++++++
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 288 ++--
llvm/test/MC/AMDGPU/hsa-diag-v4.s | 33 +-
13 files changed, 1623 insertions(+), 439 deletions(-)
diff --git a/clang/test/CodeGenHIP/hip-cumode.hip b/clang/test/CodeGenHIP/hip-cumode.hip
index 1aa1ca7a1a7ee..61fd53c644e8c 100644
--- a/clang/test/CodeGenHIP/hip-cumode.hip
+++ b/clang/test/CodeGenHIP/hip-cumode.hip
@@ -5,14 +5,20 @@
// RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
// RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=NOWGP,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=NOWGP,WARN-CUMODE %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
-// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
+// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
+// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=NOWGP,WARN-CUMODE %s
+// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
// NOWGP-NOT: .amdhsa_workgroup_processor_mode
// CUMODE-ON: .amdhsa_workgroup_processor_mode 0
// CUMODE-OFF: .amdhsa_workgroup_processor_mode 1
diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip
index bd93f9985a774..516e01a6c4743 100644
--- a/clang/test/Driver/hip-macros.hip
+++ b/clang/test/Driver/hip-macros.hip
@@ -27,21 +27,27 @@
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
+// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
+// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
// Check no duplicate warnings.
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: -mno-cumode -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
-// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
-// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
+// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
// CUMODE-ON-DAG: #define __AMDGCN_CUMODE__ 1
// CUMODE-OFF-DAG: #define __AMDGCN_CUMODE__ 0
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c7d2d268a2707..188c126cb9fbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1144,8 +1144,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
CreateExpr(STM.getWavefrontSize()), Ctx),
CreateExpr(1ULL << ScratchAlignShift));
- if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
+ if (STM.supportsWGP()) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
+ }
+
+ if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.MemOrdered = 1;
ProgInfo.FwdProgress = 1;
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0184075c2c909..951473264d089 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6270,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
ExprVal, ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
- if (IVersion.Major < 10)
- return Error(IDRange.Start, "directive requires gfx10+", IDRange);
+ if (!supportsWGP(getSTI()))
+ return Error(IDRange.Start,
+ "directive unsupported on " + getSTI().getCPU(), IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal,
ValRange);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f47ddf5d93ec3..7ca7e8448c63d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -390,7 +390,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// the original value.
bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
- bool supportsWGP() const { return getGeneration() >= GFX10; }
+ bool supportsWGP() const {
+ if (GFX1250Insts)
+ return false;
+ return getGeneration() >= GFX10;
+ }
bool hasIntClamp() const {
return HasIntClamp;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 68302f0dd0d64..1f35e92151bfc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PrintField(KD.compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split");
- if (IVersion.Major >= 10) {
+ if (AMDGPU::supportsWGP(STI))
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
".amdhsa_workgroup_processor_mode");
+ if (IVersion.Major >= 10) {
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ec9f1abdd8467..c41d62748c4be 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1167,12 +1167,21 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
// "Per CU" really means "per whatever functional block the waves of a
- // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+ // workgroup must share".
+
+ // GFX12.5 only supports CU mode, which contains four SIMDs.
+ if (isGFX1250(*STI)) {
+ assert(STI->getFeatureBits().test(FeatureCuMode));
+ return 4;
+ }
+
+ // For gfx10 in CU mode the functional block is the CU, which contains
// two SIMDs.
if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode))
return 2;
- // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
- // two CUs, so a total of four SIMDs.
+
+ // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP
+ // contains two CUs, so a total of four SIMDs.
return 4;
}
@@ -2480,6 +2489,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
}
+bool supportsWGP(const MCSubtargetInfo &STI) {
+ if (isGFX1250(STI))
+ return false;
+ return isGFX10Plus(STI);
+}
+
bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); }
bool isNotGFX10Plus(const MCSubtargetInfo &STI) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 704bf106ace76..befab68bb5698 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1549,6 +1549,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
bool isGFX12(const MCSubtargetInfo &STI);
bool isGFX12Plus(const MCSubtargetInfo &STI);
bool isGFX1250(const MCSubtargetInfo &STI);
+bool supportsWGP(const MCSubtargetInfo &STI);
bool isNotGFX12Plus(const MCSubtargetInfo &STI);
bool isNotGFX11Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 3daae98961bff..01854c8560ce2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2854,89 +2854,90 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
+; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
-; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
+; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
-; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, v15
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
-; GFX1250-NEXT: v_mov_b32_e32 v0, v16
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
+; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
+; GFX1250-NEXT: v_mov_b32_e32 v1, v14
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index f0db321d3931a..e532deaca98a8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
-; Test S_WAIT_XCNT insertion for global_load/store instructions.
+; Test S_WAIT_XCNT insertion for global_load/store clauses.
; Introduced additional operations in between the clauses to have the register dependency
; between the operands of VMEM operations and the def ops of VALU instructions that followed.
@@ -123,29 +123,10 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
-; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
-; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: v_dual_mov_b32 v39, v4 :: v_dual_mov_b32 v38, v3
+; GCN-SDAG-NEXT: s_clause 0xf
+; GCN-SDAG-NEXT: global_load_b128 v[2:5], v[0:1], off offset:224
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:160
@@ -155,138 +136,103 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:96
; GCN-SDAG-NEXT: global_load_b128 v[48:51], v[0:1], off offset:112
; GCN-SDAG-NEXT: global_load_b128 v[52:55], v[0:1], off offset:64
-; GCN-SDAG-NEXT: global_load_b128 v[38:41], v[0:1], off offset:80
-; GCN-SDAG-NEXT: global_load_b128 v[42:45], v[0:1], off offset:32
-; GCN-SDAG-NEXT: global_load_b128 v[56:59], v[0:1], off offset:48
-; GCN-SDAG-NEXT: global_load_b128 v[60:63], v[0:1], off
-; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
-; GCN-SDAG-NEXT: scratch_load_b128 v[6:9], off, s32 offset:56 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:224
-; GCN-SDAG-NEXT: scratch_load_b128 v[6:9], off, s32 offset:72 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0xe
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:240
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:192
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:208
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:160
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:176
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:128
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off offset:144
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:96
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:112
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[52:55], off offset:64
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[38:41], off offset:80
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[42:45], off offset:32
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[56:59], off offset:48
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[60:63], off
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
-; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_load_b32 v63, off, s32
-; GCN-SDAG-NEXT: scratch_load_b32 v62, off, s32 offset:4
-; GCN-SDAG-NEXT: scratch_load_b32 v61, off, s32 offset:8
-; GCN-SDAG-NEXT: scratch_load_b32 v60, off, s32 offset:12
-; GCN-SDAG-NEXT: scratch_load_b32 v59, off, s32 offset:16
-; GCN-SDAG-NEXT: scratch_load_b32 v58, off, s32 offset:20
-; GCN-SDAG-NEXT: scratch_load_b32 v57, off, s32 offset:24
-; GCN-SDAG-NEXT: scratch_load_b32 v56, off, s32 offset:28
-; GCN-SDAG-NEXT: scratch_load_b32 v45, off, s32 offset:32
-; GCN-SDAG-NEXT: scratch_load_b32 v44, off, s32 offset:36
-; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32 offset:40
-; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:44
-; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:48
-; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:52
-; GCN-SDAG-NEXT: s_wait_xcnt 0xe
-; GCN-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GCN-SDAG-NEXT: global_load_b128 v[64:67], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[68:71], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[80:83], v[0:1], off offset:48
+; GCN-SDAG-NEXT: global_load_b128 v[84:87], v[0:1], off
+; GCN-SDAG-NEXT: global_load_b128 v[96:99], v[0:1], off offset:16
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xf
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[2:5], off offset:224
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xe
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[6:9], off offset:240
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xd
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off offset:192
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xc
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[14:17], off offset:208
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xb
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[18:21], off offset:160
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xa
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off offset:176
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x9
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[26:29], off offset:128
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x8
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[30:33], off offset:144
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[34:37], off offset:96
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[48:51], off offset:112
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[52:55], off offset:64
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x4
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[64:67], off offset:80
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x3
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[68:71], off offset:32
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[80:83], off offset:48
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[84:87], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[96:99], off offset:16
+; GCN-SDAG-NEXT: s_wait_xcnt 0x10
+; GCN-SDAG-NEXT: v_mov_b32_e32 v0, v98
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GCN-GISEL-LABEL: test_v64i32_load_store:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v4
; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
-; GCN-GISEL-NEXT: s_wait_xcnt 0x8
-; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-GISEL-NEXT: s_clause 0xe
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
-; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
-; GCN-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:80
-; GCN-GISEL-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:112
-; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:128
-; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:144
-; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:160
-; GCN-GISEL-NEXT: global_load_b128 v[48:51], v[0:1], off offset:176
-; GCN-GISEL-NEXT: global_load_b128 v[52:55], v[0:1], off offset:192
-; GCN-GISEL-NEXT: global_load_b128 v[38:41], v[0:1], off offset:208
-; GCN-GISEL-NEXT: global_load_b128 v[42:45], v[0:1], off offset:224
-; GCN-GISEL-NEXT: global_load_b128 v[56:59], v[0:1], off
-; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
-; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: s_clause 0x1
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: s_clause 0xe
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[0:3], off offset:32
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[6:9], off offset:48
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[10:13], off offset:64
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[14:17], off offset:80
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[18:21], off offset:96
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[22:25], off offset:112
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[26:29], off offset:128
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[30:33], off offset:144
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[34:37], off offset:160
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[48:51], off offset:176
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[52:55], off offset:192
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[38:41], off offset:208
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[42:45], off offset:224
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[56:59], off
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[60:63], off offset:16
-; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[0:3], off offset:240
-; GCN-GISEL-NEXT: s_wait_xcnt 0x0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v0, v62
-; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_load_b32 v63, off, s32
-; GCN-GISEL-NEXT: scratch_load_b32 v62, off, s32 offset:4
-; GCN-GISEL-NEXT: scratch_load_b32 v61, off, s32 offset:8
-; GCN-GISEL-NEXT: scratch_load_b32 v60, off, s32 offset:12
-; GCN-GISEL-NEXT: scratch_load_b32 v59, off, s32 offset:16
-; GCN-GISEL-NEXT: scratch_load_b32 v58, off, s32 offset:20
-; GCN-GISEL-NEXT: scratch_load_b32 v57, off, s32 offset:24
-; GCN-GISEL-NEXT: scratch_load_b32 v56, off, s32 offset:28
-; GCN-GISEL-NEXT: scratch_load_b32 v47, off, s32 offset:32
-; GCN-GISEL-NEXT: scratch_load_b32 v46, off, s32 offset:36
-; GCN-GISEL-NEXT: scratch_load_b32 v45, off, s32 offset:40
-; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:44
-; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:48
-; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 offset:52
-; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:56
-; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:60
+; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
+; GCN-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:16
+; GCN-GISEL-NEXT: global_load_b128 v[18:21], v[0:1], off offset:64
+; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:80
+; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96
+; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112
+; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:128
+; GCN-GISEL-NEXT: global_load_b128 v[48:51], v[0:1], off offset:144
+; GCN-GISEL-NEXT: global_load_b128 v[52:55], v[0:1], off offset:160
+; GCN-GISEL-NEXT: global_load_b128 v[64:67], v[0:1], off offset:176
+; GCN-GISEL-NEXT: global_load_b128 v[68:71], v[0:1], off offset:192
+; GCN-GISEL-NEXT: global_load_b128 v[80:83], v[0:1], off offset:208
+; GCN-GISEL-NEXT: global_load_b128 v[84:87], v[0:1], off offset:224
+; GCN-GISEL-NEXT: global_load_b128 v[96:99], v[0:1], off offset:240
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xf
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[2:5], off offset:32
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xe
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[6:9], off offset:48
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xd
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xc
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[14:17], off offset:16
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xb
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[18:21], off offset:64
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xa
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[22:25], off offset:80
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x9
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[26:29], off offset:96
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x8
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[30:33], off offset:112
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x7
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[34:37], off offset:128
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[48:51], off offset:144
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x5
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[52:55], off offset:160
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x4
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[64:67], off offset:176
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x3
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[68:71], off offset:192
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x2
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[80:83], off offset:208
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[84:87], off offset:224
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[96:99], off offset:240
+; GCN-GISEL-NEXT: s_wait_xcnt 0x10
+; GCN-GISEL-NEXT: v_mov_b32_e32 v0, v16
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
store <64 x i32> %vec, ptr addrspace(1) %out, align 4
@@ -299,99 +245,78 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
-; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
-; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:48
-; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:112
+; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:96
+; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:48
+; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:32
; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[24:25], 0x70
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 0x60
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 48
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 32
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[40:41], 16
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[42:43], 0
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64
-; GCN-SDAG-NEXT: v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x70
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64
+; GCN-SDAG-NEXT: v_dual_mov_b32 v34, 0xc8 :: v_dual_mov_b32 v35, 0
; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[24:25], v[10:13], off
+; GCN-SDAG-NEXT: global_store_b128 v[36:37], v[6:9], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
-; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[18:21], off
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
; GCN-SDAG-NEXT: s_wait_xcnt 0x1
-; GCN-SDAG-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
+; GCN-SDAG-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v37, v17
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-SDAG-NEXT: s_wait_loadcnt 0x4
-; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[18:21], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x3
-; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[14:17], off
+; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[22:25], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
-; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[26:29], off
+; GCN-SDAG-NEXT: global_store_b128 v[64:65], v[26:29], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[30:33], off
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: s_wait_xcnt 0x3
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: global_store_b128 v[66:67], v[30:33], off
+; GCN-SDAG-NEXT: s_wait_xcnt 0x0
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x2
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], 0x64, v[16:17]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x1
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7]
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], 0xc8, v[14:15]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[24:25], 0x64, v[24:25]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off
-; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[0:3], off
+; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[0:3], off
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:96
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:112
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[50:53], off offset:64
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:48
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:64
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:80
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:32
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16
-; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32
-; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:4
-; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:8
-; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:12
-; GCN-SDAG-NEXT: s_wait_xcnt 0xc
+; GCN-SDAG-NEXT: s_wait_xcnt 0x8
; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GCN-GISEL-LABEL: test_v16i64_load_store:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
-; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
@@ -405,11 +330,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16
; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32
; GCN-GISEL-NEXT: v_mov_b64_e32 v[52:53], 48
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[66:67], 0x60
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[68:69], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[40:41], 0x50
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[64:65], 0x50
; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x5
@@ -419,13 +344,13 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: s_wait_loadcnt 0x3
; GCN-GISEL-NEXT: global_store_b128 v[52:53], v[22:25], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x2
-; GCN-GISEL-NEXT: global_store_b128 v[42:43], v[26:29], off
+; GCN-GISEL-NEXT: global_store_b128 v[66:67], v[26:29], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
-; GCN-GISEL-NEXT: global_store_b128 v[44:45], v[30:33], off
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT: global_store_b128 v[68:69], v[30:33], off
; GCN-GISEL-NEXT: s_wait_xcnt 0x5
-; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-GISEL-NEXT: s_wait_xcnt 0x4
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
@@ -448,7 +373,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-GISEL-NEXT: s_clause 0x1
; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off
-; GCN-GISEL-NEXT: global_store_b128 v[40:41], v[34:37], off
+; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[34:37], off
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:16
@@ -458,15 +383,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[26:29], off offset:96
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[30:33], off offset:112
-; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_load_b32 v45, off, s32
-; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:4
-; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:8
-; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 offset:12
-; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:16
-; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:20
+; GCN-GISEL-NEXT: s_wait_xcnt 0x9
; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4
%in_a = insertelement <16 x i64> %a, i64 100, i32 5
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 62d7f4801baf8..94f5aab1eb67d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s
define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX6-LABEL: local_workgroup_unordered_load:
@@ -177,6 +178,18 @@ define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_unordered_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") unordered, align 4
@@ -348,6 +361,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") monotonic, align 4
@@ -524,6 +549,18 @@ define amdgpu_kernel void @local_workgroup_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4
@@ -718,6 +755,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4
@@ -859,6 +909,16 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") unordered, align 4
@@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") monotonic, align 4
@@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
@@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4
@@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") monotonic
@@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire
@@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release
@@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire
@@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") unordered, align 4
@@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") monotonic, align 4
@@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") acquire, align 4
@@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") unordered, align 4
@@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") release, align 4
@@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") release
@@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 8304be958f1ad..f78168ba29ef1 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -343,66 +343,66 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v40, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v40, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v40, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v40, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v40, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v40, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v40, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v40, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v40, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[34:35]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[42:43]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[36:37]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[38:39]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[54:55]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[56:57]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[48:49]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[42:43]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[48:49]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[50:51]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[44:45]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[46:47]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[32:33]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v32_vs:
@@ -1600,66 +1600,66 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v40, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v40, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v40, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v40, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v40, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v40, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v40, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v40, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v40, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[34:35]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[42:43]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[36:37]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[34:35]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[38:39]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[54:55]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[56:57]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[48:49]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[42:43]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[48:49]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[50:51]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[44:45]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[46:47]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[32:33]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v32_vs:
@@ -2431,65 +2431,65 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v34, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v34, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v34, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v34, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v34, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v34, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v34, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v34, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v34, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[30:31]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[28:29]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[12:13]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[56:57], s[14:15]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[2:3]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[4:5]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[6:7]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[24:25]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[26:27]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[8:9]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[10:11]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[42:43], v[42:43]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[16:17]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[54:55], v[54:55]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[36:37], v[36:37]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[38:39], v[38:39]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[56:57], v[56:57]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[52:53], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[34:35], v[34:35]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[54:55], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[50:51], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[46:47], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[48:49], v[48:49]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[42:43], v[42:43]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[48:49], v[48:49]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[50:51], v[50:51]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[44:45], v[44:45]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[46:47], v[46:47]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[32:33], v[32:33]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[44:45], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v32_vs:
diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
index 9ab177cf2b125..44fe55ef6e9ba 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
@@ -1,9 +1,10 @@
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX8,PREGFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx810 --check-prefixes=ALL,GCN,GFX8,PREGFX10,NOWGP,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX10,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX11,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx1200 --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,NONAMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GFX90A,PREGFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx90a --check-prefixes=ALL,GFX90A,PREGFX10,NOWGP,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1250 -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx1250 --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,NOWGP,AMDHSA
.text
@@ -11,7 +12,7 @@
// GFX8-NOT: error:
// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1010:xnack+
// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1100
-// GFX12: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1200
+// GFX12: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--[[MCPU]]
// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-unknown--gfx810
.warning "test_target"
.amdgcn_target "amdgcn-amd-amdhsa--gfx810:xnack+"
@@ -176,8 +177,7 @@
.end_amdhsa_kernel
// GCN-LABEL: warning: test_amdhsa_workgroup_processor_mode
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: .amdhsa_next_free_vgpr directive is required
+// NOWGP: error: directive unsupported on [[MCPU]]
// NONAMDHSA: error: unknown directive
.warning "test_amdhsa_workgroup_processor_mode"
.amdhsa_kernel test_amdhsa_workgroup_processor_mode
@@ -185,8 +185,7 @@
.end_amdhsa_kernel
// GCN-LABEL: warning: test_amdhsa_workgroup_processor_mode_invalid
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: value out of range
+// NOWGP: error: directive unsupported on [[MCPU]]
// NONAMDHSA: error: unknown directive
.warning "test_amdhsa_workgroup_processor_mode_invalid"
.amdhsa_kernel test_amdhsa_workgroup_processor_mode_invalid
@@ -290,6 +289,24 @@
.amdhsa_inst_pref_size 15
.end_amdhsa_kernel
+// GCN-LABEL: warning: test_amdhsa_dx10_clamp_bit
+// GFX12: error: directive unsupported on gfx12+
+.warning "test_amdhsa_dx10_clamp_bit"
+.amdhsa_kernel test_amdhsa_dx10_clamp_bit
+ .amdhsa_next_free_vgpr 32
+ .amdhsa_next_free_sgpr 0
+ .amdhsa_dx10_clamp 1
+.end_amdhsa_kernel
+
+// GCN-LABEL: warning: test_amdhsa_ieee_mode_bit
+// GFX12: error: directive unsupported on gfx12+
+.warning "test_amdhsa_ieee_mode_bit"
+.amdhsa_kernel test_amdhsa_ieee_mode_bit
+ .amdhsa_next_free_vgpr 32
+ .amdhsa_next_free_sgpr 0
+ .amdhsa_ieee_mode 1
+.end_amdhsa_kernel
+
// GCN-LABEL: warning: test_next_free_vgpr_invalid
// AMDHSA: error: .amdgcn.next_free_{v,s}gpr symbols must be absolute expressions
// NONAMDHSA-NOT: error:
More information about the cfe-commits
mailing list